diff --git a/.ci/test-coverage.yml b/.ci/test-coverage.yml index e0800bb6bf3..6cd442850b3 100644 --- a/.ci/test-coverage.yml +++ b/.ci/test-coverage.yml @@ -840,149 +840,6 @@ jobs: - name: codecov run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info - linux-gcc-riscv64-rvv: - name: linux-gcc-riscv64-rvv - strategy: - matrix: - OPENMP: ['OFF', 'ON'] - - runs-on: - pool-name: docker - container: - image: bkci/ci:ubuntu - steps: - - name: checkout - checkout: self - with: - strategy: FRESH_CHECKOUT - enableSubmodule: false - enableGitLfs: false - - - name: install-deps - run: | - apt-get update - apt-get install -y lcov libcapstone4 libglib2.0-0 - curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import - curl -Os https://uploader.codecov.io/latest/linux/codecov - curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM - curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig - gpgv codecov.SHA256SUM.sig codecov.SHA256SUM - shasum -a 256 -c codecov.SHA256SUM - chmod +x codecov - - - name: cache-qemu - id: cache-qemu - uses: cache@1.* - with: - cachePaths: qemu-install - cacheKey: qemu-riscv64-install-20230624-1 - - - name: checkout-qemu - if: steps.cache-qemu.outputs.cacheHit != 'true' - checkout: https://github.com/qemu/qemu.git - with: - pullType: COMMIT_ID - refName: b455ce4c2f300c8ba47cba7232dd03261368a4cb - localPath: qemu - enableSubmodule: false - enableGitLfs: false - - - name: qemu - if: steps.cache-qemu.outputs.cacheHit != 'true' - run: | - echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye main' | tee -a /etc/apt/sources.list - echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye-updates main' | tee -a /etc/apt/sources.list - apt-get update - apt-get build-dep -y qemu - apt-get install -y python3-pip - python3 -m pip install --upgrade pip - apt-get remove -y python3-setuptools - pip3 install -U setuptools - cd qemu - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - ./configure --prefix=${{ci.workspace}}/qemu-install --target-list=riscv64-linux-user --disable-system - make -j$(nproc) - make install - - - name: cache-rv64gcv - id: cache-rv64gcv - uses: cache@1.* - with: - cachePaths: rv64gcv-install - cacheKey: rv64gcv-linux-install-20221029-1 - - - name: checkout-riscv-gnu-toolchain - if: steps.cache-rv64gcv.outputs.cacheHit != 'true' - checkout: https://github.com/riscv/riscv-gnu-toolchain.git - with: - pullType: COMMIT_ID - refName: da01ba455ce3802ffa84fdca3a089079996dbfc3 - localPath: riscv-gnu-toolchain - enableSubmodule: false - enableGitLfs: false - - - name: riscv-gnu-toolchain - if: steps.cache-rv64gcv.outputs.cacheHit != 'true' - run: | - apt-get update - apt-get install -y autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler - cd riscv-gnu-toolchain - git submodule update --init --recursive --depth 1 glibc - git submodule update --init --recursive --depth 1 newlib - #git submodule update --init --recursive --depth 1 riscv-binutils - #git submodule update --init --recursive --depth 1 riscv-gcc - git submodule update --init --recursive --depth 1 riscv-dejagnu - git submodule update --init --recursive 
--depth 1 riscv-gdb - rm -rf riscv-binutils - git clone -b binutils-2_39-branch https://sourceware.org/git/binutils-gdb.git riscv-binutils - rm -rf riscv-gcc - git clone -b riscv-gcc-rvv-next https://github.com/riscv-collab/riscv-gcc.git riscv-gcc - cd riscv-gcc - git checkout 8a0c1b106f01c455a8fb478cfe52d859a69020fd - cd .. - sed -i '/__OBSOLETE_MATH/d' newlib/newlib/libm/common/math_errf.c - ./configure --prefix=${{ci.workspace}}/rv64gcv-install --with-arch=rv64gcv_zfh - make linux -j$(nproc) - find ${{ci.workspace}}/rv64gcv-install -type f | xargs -i strip -g {} || true - - - name: build - run: | - export RISCV_ROOT_PATH=${{ci.workspace}}/rv64gcv-install - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DCMAKE_C_FLAGS="-O1" -DCMAKE_CXX_FLAGS="-O1" -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_OPENMP=${{matrix.OPENMP}} -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 4 - - name: test-vlen128 - run: | - export PATH=${{ci.workspace}}/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,x-zvfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;${{ci.workspace}}/rv64gcv-install/sysroot" ctest --output-on-failure -j 4 - - name: lcov-collect-vlen128 - run: | - cd build - lcov --gcov-tool ${{ci.workspace}}/rv64gcv-install/bin/riscv64-unknown-linux-gnu-gcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov -r lcov.info '*/rv64gcv-install/*' -o lcov.info - lcov --list lcov.info - - name: codecov-vlen128 - run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info - - name: test-vlen256 - run: | - export PATH=${{ci.workspace}}/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,x-zvfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;${{ci.workspace}}/rv64gcv-install/sysroot" ctest --output-on-failure -j 4 - - name: lcov-collect-vlen256 - run: | - cd build - lcov --gcov-tool ${{ci.workspace}}/rv64gcv-install/bin/riscv64-unknown-linux-gnu-gcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov -r lcov.info '*/rv64gcv-install/*' -o lcov.info - lcov --list lcov.info - - name: codecov-vlen256 - run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info - linux-gcc-loongarch64: name: linux-gcc-loongarch64 strategy: diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml index 4bfa4adc854..424a04c15a5 100644 --- a/.github/workflows/android.yml +++ b/.github/workflows/android.yml @@ -9,6 +9,7 @@ on: - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' + - 'src/layer/riscv/**' - 'src/layer/x86/**' - 'src/layer/vulkan/**' pull_request: @@ -20,6 +21,7 @@ on: - 'src/*' - 'src/layer/*' - 'src/layer/arm/**' + - 'src/layer/riscv/**' - 'src/layer/x86/**' - 'src/layer/vulkan/**' concurrency: @@ -64,6 +66,11 @@ jobs: mkdir build-x86_64 && cd build-x86_64 cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86_64" .. cmake --build . -j $(nproc) + - name: riscv64 + run: | + mkdir build-riscv64 && cd build-riscv64 + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="riscv64" .. + cmake --build . 
-j $(nproc) - name: armeabi-v7a-shared run: | @@ -85,6 +92,11 @@ jobs: mkdir build-x86_64-shared && cd build-x86_64-shared cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86_64" -DNCNN_SHARED_LIB=ON .. cmake --build . -j $(nproc) + - name: riscv64-shared + run: | + mkdir build-riscv64-shared && cd build-riscv64-shared + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="riscv64" -DNCNN_SHARED_LIB=ON .. + cmake --build . -j $(nproc) ndk-r16b: runs-on: ubuntu-latest diff --git a/.github/workflows/elf-riscv32-cpu-gcc.yml b/.github/workflows/elf-riscv32.yml similarity index 96% rename from .github/workflows/elf-riscv32-cpu-gcc.yml rename to .github/workflows/elf-riscv32.yml index 88ec5efb53a..854ac8e8f22 100644 --- a/.github/workflows/elf-riscv32-cpu-gcc.yml +++ b/.github/workflows/elf-riscv32.yml @@ -1,9 +1,9 @@ -name: elf-riscv32-cpu-gcc +name: elf-riscv32 on: push: branches: [master] paths: - - '.github/workflows/elf-riscv32-cpu-gcc.yml' + - '.github/workflows/elf-riscv32.yml' - 'toolchains/riscv32-unknown-elf.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' @@ -14,7 +14,7 @@ on: pull_request: branches: [master] paths: - - '.github/workflows/elf-riscv32-cpu-gcc.yml' + - '.github/workflows/elf-riscv32.yml' - 'toolchains/riscv32-unknown-elf.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' @@ -23,7 +23,7 @@ on: - 'src/layer/riscv/**' - 'tests/**' concurrency: - group: elf-riscv32-cpu-gcc-${{ github.ref }} + group: elf-riscv32-${{ github.ref }} cancel-in-progress: true permissions: contents: read diff --git a/.github/workflows/elf-riscv64-cpu-gcc.yml b/.github/workflows/elf-riscv64.yml similarity index 96% rename from .github/workflows/elf-riscv64-cpu-gcc.yml rename to .github/workflows/elf-riscv64.yml index 0b7c584cbb6..0e0e251065b 100644 --- a/.github/workflows/elf-riscv64-cpu-gcc.yml +++ b/.github/workflows/elf-riscv64.yml @@ -1,9 +1,9 @@ -name: elf-riscv64-cpu-gcc +name: elf-riscv64 on: push: branches: [master] paths: - - '.github/workflows/elf-riscv64-cpu-gcc.yml' + - '.github/workflows/elf-riscv64.yml' - 'toolchains/riscv64-unknown-elf.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' @@ -14,7 +14,7 @@ on: pull_request: branches: [master] paths: - - '.github/workflows/elf-riscv64-cpu-gcc.yml' + - '.github/workflows/elf-riscv64.yml' - 'toolchains/riscv64-unknown-elf.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' @@ -23,7 +23,7 @@ on: - 'src/layer/riscv/**' - 'tests/**' concurrency: - group: elf-riscv64-cpu-gcc-${{ github.ref }} + group: elf-riscv64-${{ github.ref }} cancel-in-progress: true permissions: contents: read diff --git a/.github/workflows/linux-aarch64-cpu-gcc.yml b/.github/workflows/linux-aarch64.yml similarity index 96% rename from .github/workflows/linux-aarch64-cpu-gcc.yml rename to .github/workflows/linux-aarch64.yml index 0c1032bf9c0..567575336f3 100644 --- a/.github/workflows/linux-aarch64-cpu-gcc.yml +++ b/.github/workflows/linux-aarch64.yml @@ -1,9 +1,9 @@ -name: linux-aarch64-cpu-gcc +name: linux-aarch64 on: push: branches: [master] paths: - - '.github/workflows/linux-aarch64-cpu-gcc.yml' + - '.github/workflows/linux-aarch64.yml' - 'toolchains/aarch64-linux-gnu.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' @@ -14,7 +14,7 @@ on: pull_request: branches: [master] paths: - - '.github/workflows/linux-aarch64-cpu-gcc.yml' + - '.github/workflows/linux-aarch64.yml' - 'toolchains/aarch64-linux-gnu.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' @@ -23,13 +23,13 @@ on: - 'src/layer/arm/**' - 'tests/**' concurrency: - group: linux-aarch64-cpu-gcc-${{ github.ref }} + 
group: linux-aarch64-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: - linux-gcc: + gcc-aarch64: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 @@ -98,7 +98,7 @@ jobs: cd build-simplestl-simplemath TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j $(nproc) - linux-gcc-arm82: + gcc-arm82: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 @@ -156,7 +156,7 @@ jobs: cd build-noint8 TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j $(nproc) - linux-gcc-arm86: + gcc-arm86: runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/linux-arm-cpu-gcc.yml b/.github/workflows/linux-arm.yml similarity index 96% rename from .github/workflows/linux-arm-cpu-gcc.yml rename to .github/workflows/linux-arm.yml index 19d9c1cb370..a6836e813e2 100644 --- a/.github/workflows/linux-arm-cpu-gcc.yml +++ b/.github/workflows/linux-arm.yml @@ -1,9 +1,9 @@ -name: linux-arm-cpu-gcc +name: linux-arm on: push: branches: [master] paths: - - '.github/workflows/linux-arm-cpu-gcc.yml' + - '.github/workflows/linux-arm.yml' - 'toolchains/arm-linux-gnueabi.toolchain.cmake' - 'toolchains/arm-linux-gnueabihf.toolchain.cmake' - 'CMakeLists.txt' @@ -15,7 +15,7 @@ on: pull_request: branches: [master] paths: - - '.github/workflows/linux-arm-cpu-gcc.yml' + - '.github/workflows/linux-arm.yml' - 'toolchains/arm-linux-gnueabi.toolchain.cmake' - 'toolchains/arm-linux-gnueabihf.toolchain.cmake' - 'CMakeLists.txt' @@ -25,13 +25,13 @@ on: - 'src/layer/arm/**' - 'tests/**' concurrency: - group: linux-arm-cpu-gcc-${{ github.ref }} + group: linux-arm-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: - linux-gcc-arm: + gcc-arm: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 @@ -89,7 +89,7 @@ jobs: cd build-noint8 TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j $(nproc) - linux-gcc-armhf: + gcc-armhf: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 @@ -147,7 +147,7 @@ jobs: cd build-noint8 TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc) - linux-gcc-armhf-vfpv3-d16: + gcc-armhf-vfpv3-d16: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/linux-loongarch64-cpu-gcc.yml b/.github/workflows/linux-loongarch64.yml similarity index 91% rename from .github/workflows/linux-loongarch64-cpu-gcc.yml rename to .github/workflows/linux-loongarch64.yml index 5a23fdc4bc1..c6d3142d1df 100644 --- a/.github/workflows/linux-loongarch64-cpu-gcc.yml +++ b/.github/workflows/linux-loongarch64.yml @@ -1,9 +1,9 @@ -name: linux-loongarch64-cpu-gcc +name: linux-loongarch64 on: push: branches: [master] paths: - - '.github/workflows/linux-loongarch64-cpu-gcc.yml' + - '.github/workflows/linux-loongarch64.yml' - 'toolchains/loongarch64-linux-gnu.toolchain.cmake' - 'toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake' - 'CMakeLists.txt' @@ -15,7 +15,7 @@ on: pull_request: branches: [master] paths: - - '.github/workflows/linux-loongarch64-cpu-gcc.yml' + - '.github/workflows/linux-loongarch64.yml' - 'toolchains/loongarch64-linux-gnu.toolchain.cmake' - 'toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake' - 'CMakeLists.txt' @@ -25,13 +25,13 @@ on: - 'src/layer/loongarch/**' - 'tests/**' concurrency: - 
group: linux-loongarch64-cpu-gcc-${{ github.ref }} + group: linux-loongarch64-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: - linux-gcc-loongarch64: + gcc-loongarch64: runs-on: [self-hosted, linux, centos] steps: diff --git a/.github/workflows/linux-mips-cpu-gcc.yml b/.github/workflows/linux-mips.yml similarity index 94% rename from .github/workflows/linux-mips-cpu-gcc.yml rename to .github/workflows/linux-mips.yml index f6e1e74792c..30856af821b 100644 --- a/.github/workflows/linux-mips-cpu-gcc.yml +++ b/.github/workflows/linux-mips.yml @@ -1,9 +1,9 @@ -name: linux-mips-cpu-gcc +name: linux-mips on: push: branches: [master] paths: - - '.github/workflows/linux-mips-cpu-gcc.yml' + - '.github/workflows/linux-mips.yml' - 'toolchains/mipsel-linux-gnu.toolchain.cmake' - 'toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake' - 'CMakeLists.txt' @@ -15,7 +15,7 @@ on: pull_request: branches: [master] paths: - - '.github/workflows/linux-mips-cpu-gcc.yml' + - '.github/workflows/linux-mips.yml' - 'toolchains/mipsel-linux-gnu.toolchain.cmake' - 'toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake' - 'CMakeLists.txt' @@ -25,13 +25,13 @@ on: - 'src/layer/mips/**' - 'tests/**' concurrency: - group: linux-mips-cpu-gcc-${{ github.ref }} + group: linux-mips-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: - linux-gcc-mipsel: + gcc-mipsel: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 @@ -78,7 +78,7 @@ jobs: cd build TESTS_EXECUTABLE_LOADER=qemu-mipsel TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsel-linux-gnu" ctest --output-on-failure -j $(nproc) - linux-gcc-mipsisa32r6el: + gcc-mipsisa32r6el: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/linux-mips64-cpu-gcc.yml b/.github/workflows/linux-mips64.yml similarity index 95% rename from .github/workflows/linux-mips64-cpu-gcc.yml rename to .github/workflows/linux-mips64.yml index 890f1054d5b..df818bc9d0e 100644 --- a/.github/workflows/linux-mips64-cpu-gcc.yml +++ b/.github/workflows/linux-mips64.yml @@ -1,9 +1,9 @@ -name: linux-mips64-cpu-gcc +name: linux-mips64 on: push: branches: [master] paths: - - '.github/workflows/linux-mips64-cpu-gcc.yml' + - '.github/workflows/linux-mips64.yml' - 'toolchains/mips64el-linux-gnuabi64.toolchain.cmake' - 'toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake' - 'CMakeLists.txt' @@ -15,7 +15,7 @@ on: pull_request: branches: [master] paths: - - '.github/workflows/linux-mips64-cpu-gcc.yml' + - '.github/workflows/linux-mips64.yml' - 'toolchains/mips64el-linux-gnuabi64.toolchain.cmake' - 'toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake' - 'CMakeLists.txt' @@ -25,13 +25,13 @@ on: - 'src/layer/mips/**' - 'tests/**' concurrency: - group: linux-mips64-cpu-gcc-${{ github.ref }} + group: linux-mips64-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: - linux-gcc-mips64el: + gcc-mips64el: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 @@ -78,7 +78,7 @@ jobs: cd build TESTS_EXECUTABLE_LOADER=qemu-mips64el TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mips64el-linux-gnuabi64" ctest --output-on-failure -j $(nproc) - linux-gcc-mipsisa64r6el: + gcc-mipsisa64r6el: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/linux-ppc64-cpu-gcc.yml b/.github/workflows/linux-ppc64.yml similarity index 96% rename from .github/workflows/linux-ppc64-cpu-gcc.yml rename to .github/workflows/linux-ppc64.yml index d266bd58f77..10d7b15a3da 100644 --- 
a/.github/workflows/linux-ppc64-cpu-gcc.yml +++ b/.github/workflows/linux-ppc64.yml @@ -1,9 +1,9 @@ -name: linux-ppc64-cpu-gcc +name: linux-ppc64 on: push: branches: [master] paths: - - '.github/workflows/linux-ppc64-cpu-gcc.yml' + - '.github/workflows/linux-ppc64.yml' - 'toolchains/powerpc64le-linux-gnu.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' @@ -14,7 +14,7 @@ on: pull_request: branches: [master] paths: - - '.github/workflows/linux-ppc64-cpu-gcc.yml' + - '.github/workflows/linux-ppc64.yml' - 'toolchains/powerpc64le-linux-gnu.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' @@ -23,13 +23,13 @@ on: - 'src/layer/x86/*' - 'tests/**' concurrency: - group: linux-ppc64-cpu-gcc-${{ github.ref }} + group: linux-ppc64-${{ github.ref }} cancel-in-progress: true permissions: contents: read jobs: - linux-gcc-ppc: + gcc-ppc: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 @@ -76,7 +76,7 @@ jobs: cd build TESTS_EXECUTABLE_LOADER=qemu-ppc TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc-linux-gnu" ctest --output-on-failure -j $(nproc) - linux-gcc-ppc64le: + gcc-ppc64le: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 @@ -123,7 +123,7 @@ jobs: cd build TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu" ctest --output-on-failure -j $(nproc) - linux-gcc-power8le-vsx: + gcc-power8le-vsx: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 @@ -169,7 +169,7 @@ jobs: export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH cd build TESTS_EXECUTABLE_LOADER=qemu-ppc64le TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu;-cpu;power8_v2.0" ctest --output-on-failure -j $(nproc) - linux-gcc-power9le-vsx: + gcc-power9le-vsx: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/linux-riscv64-cpu-gcc.yml b/.github/workflows/linux-riscv64-cpu-gcc.yml deleted file mode 100644 index cfd9685b800..00000000000 --- a/.github/workflows/linux-riscv64-cpu-gcc.yml +++ /dev/null @@ -1,192 +0,0 @@ -name: linux-riscv64-cpu-gcc -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-riscv64-cpu-gcc.yml' - - 'toolchains/riscv64-linux-gnu.toolchain.cmake' - - 'toolchains/riscv64-unknown-linux-gnu.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/riscv/**' - - 'tests/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-riscv64-cpu-gcc.yml' - - 'toolchains/riscv64-linux-gnu.toolchain.cmake' - - 'toolchains/riscv64-unknown-linux-gnu.toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/riscv/**' - - 'tests/**' -concurrency: - group: linux-riscv64-cpu-gcc-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-gcc-riscv64: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v4 - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v4 - with: - path: qemu-install - key: qemu-riscv64-install-20220502-4 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v4 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - wget 
https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system - make -j$(nproc) - make install - - - name: riscv64-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-riscv64-linux-gnu - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - - name: build - run: cmake --build build -j $(nproc) - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/riscv64-linux-gnu" ctest --output-on-failure -j $(nproc) - - linux-gcc-riscv64-c906: - runs-on: [self-hosted, linux, centos] - steps: - - uses: actions/checkout@v4 - - - name: configure - run: | - export RISCV_ROOT_PATH=/data/action/osd/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1 - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/c906-v226.toolchain.cmake -DCMAKE_BUILD_TYPE=release -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON -DNCNN_BUILD_TESTS=ON .. - - name: build - run: cmake --build build -j 4 - - - name: test - run: | - export PATH=/data/action/osd/xuantie-qemu-x86_64-Ubuntu-18.04-20230413-0706/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;c906fdv" ctest --output-on-failure -j 4 - - linux-gcc-riscv64-rvv: - runs-on: [self-hosted, linux, centos] - steps: - - uses: actions/checkout@v4 - - #- name: cache-qemu - #id: cache-qemu - #uses: actions/cache@v4 - #with: - #path: qemu-install - #key: qemu-riscv64-install-20220502-3 - #- name: install-qemu-build-deps - #if: steps.cache-qemu.outputs.cache-hit != 'true' - #run: | - #sudo apt-get update - #sudo apt-get install autoconf automake autotools-dev ninja-build - #- name: checkout-qemu - #if: steps.cache-qemu.outputs.cache-hit != 'true' - #uses: actions/checkout@v4 - #with: - #repository: qemu/qemu - #path: qemu - #ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - #- name: qemu - #if: steps.cache-qemu.outputs.cache-hit != 'true' - #run: | - #cd qemu - #wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - #patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - #./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system - #make -j2 - #make install - - #- name: cache-riscv - #id: cache-riscv - #uses: actions/cache@v4 - #with: - #path: rv64gcv-install-next - #key: rv64gcv-linux-install-20210504 - - #- name: install-riscv-build-deps - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #sudo apt-get update - #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler - - #- name: checkout-riscv-gnu-toolchain - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #uses: actions/checkout@v4 - #with: - #repository: riscv/riscv-gnu-toolchain - #path: riscv-gnu-toolchain - #ref: 
da01ba455ce3802ffa84fdca3a089079996dbfc3 - #- name: checkout-riscv-gnu-toolchain-submodules - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #cd riscv-gnu-toolchain - #git submodule update --init --recursive --depth 1 glibc - #git submodule update --init --recursive --depth 1 newlib - #git submodule update --init --recursive --depth 1 riscv-binutils - #git submodule update --init --recursive --depth 1 riscv-gcc - #git submodule update --init --recursive --depth 1 riscv-dejagnu - #git submodule update --init --recursive --depth 1 riscv-gdb - #- name: riscv-gnu-toolchain - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #cd riscv-gnu-toolchain - #sed -i '/__OBSOLETE_MATH/d' newlib/newlib/libm/common/math_errf.c - #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install-next --with-arch=rv64gcv_zfh - #make linux - - #- name: riscv-strip-install - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: find $GITHUB_WORKSPACE/rv64gcv-install-next -type f | xargs -i strip -g {} || true - - - name: configure - run: export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install-next && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - - name: build - run: cmake --build build -j 4 - - - name: test-vlen256 - run: | - export PATH=/data/action/osd/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 - - - name: test-vlen128 - run: | - export PATH=/data/action/osd/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 diff --git a/.github/workflows/linux-riscv64-cpu-gnu-clang.yml b/.github/workflows/linux-riscv64-cpu-gnu-clang.yml deleted file mode 100644 index 45aa5e35a44..00000000000 --- a/.github/workflows/linux-riscv64-cpu-gnu-clang.yml +++ /dev/null @@ -1,142 +0,0 @@ -name: linux-riscv64-cpu-gnu-clang -on: - push: - branches: [master] - paths: - - '.github/workflows/linux-riscv64-cpu-gnu-clang.yml' - - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/riscv/**' - - 'tests/**' - pull_request: - branches: [master] - paths: - - '.github/workflows/linux-riscv64-cpu-gnu-clang.yml' - - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake' - - 'CMakeLists.txt' - - 'cmake/**' - - 'src/*' - - 'src/layer/*' - - 'src/layer/riscv/**' - - 'tests/**' -concurrency: - group: linux-riscv64-cpu-gnu-clang-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - linux-gcc-riscv64-rvv: - runs-on: [self-hosted, linux, centos] - steps: - - uses: actions/checkout@v4 - - #- name: cache-qemu - #id: cache-qemu - #uses: actions/cache@v3 - #with: - #path: qemu-install - #key: qemu-riscv64-install-20220502-3 - #- name: install-qemu-build-deps - #if: steps.cache-qemu.outputs.cache-hit != 'true' - #run: | - #sudo apt-get update - #sudo apt-get install autoconf automake autotools-dev ninja-build - #- name: checkout-qemu - #if: steps.cache-qemu.outputs.cache-hit != 'true' - #uses: actions/checkout@v4 - #with: - #repository: qemu/qemu - #path: qemu - #ref: 
f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - #- name: qemu - #if: steps.cache-qemu.outputs.cache-hit != 'true' - #run: | - #cd qemu - #wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - #patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - #./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system - #make -j2 - #make install - - #- name: cache-riscv - #id: cache-riscv - #uses: actions/cache@v3 - #with: - #path: rv64gcv-install-next - #key: rv64gcv-linux-install-20210504 - - #- name: install-riscv-build-deps - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #sudo apt-get update - #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler - - #- name: checkout-riscv-gnu-toolchain - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #uses: actions/checkout@v4 - #with: - #repository: riscv/riscv-gnu-toolchain - #path: riscv-gnu-toolchain - #ref: da01ba455ce3802ffa84fdca3a089079996dbfc3 - #- name: checkout-riscv-gnu-toolchain-submodules - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #cd riscv-gnu-toolchain - #git submodule update --init --recursive --depth 1 glibc - #git submodule update --init --recursive --depth 1 newlib - #git submodule update --init --recursive --depth 1 riscv-binutils - #git submodule update --init --recursive --depth 1 riscv-gcc - #git submodule update --init --recursive --depth 1 riscv-dejagnu - #git submodule update --init --recursive --depth 1 riscv-gdb - #- name: riscv-gnu-toolchain - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #cd riscv-gnu-toolchain - #sed -i '/__OBSOLETE_MATH/d' newlib/newlib/libm/common/math_errf.c - #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install-next --with-arch=rv64gcv_zfh - #make linux - - #- name: riscv-strip-install - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: find $GITHUB_WORKSPACE/rv64gcv-install-next -type f | xargs -i strip -g {} || true - - # - name: install-clang - # run: | - # wget https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.1/llvm-project-15.0.1.src.tar.xz - # tar -xf llvm-project-15.0.1.src.tar.xz - # cd llvm-project-15.0.1.src - # mkdir build - # cd build - # cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_TARGETS_TO_BUILD="RISCV" -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF ../llvm/ - # make -j16 - # make install - - - name: build - env: - LD_LIBRARY_PATH: /data/action/install/lib64 - run: | - export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install-next - export PATH=/data/action/osd/llvm-project-15.0.1.src/build/install/bin:$PATH - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
-j 4 - - - name: test-vlen256 - env: - LD_LIBRARY_PATH: /data/action/install/lib64 - run: | - export PATH=/data/action/osd/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 - - - name: test-vlen128 - env: - LD_LIBRARY_PATH: /data/action/install/lib64 - run: | - export PATH=/data/action/osd/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 diff --git a/.github/workflows/linux-riscv64.yml b/.github/workflows/linux-riscv64.yml new file mode 100644 index 00000000000..d7a47002945 --- /dev/null +++ b/.github/workflows/linux-riscv64.yml @@ -0,0 +1,312 @@ +name: linux-riscv64 +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-riscv64.yml' + - 'toolchains/riscv64-linux-gnu.toolchain.cmake' + - 'toolchains/riscv64-unknown-linux-gnu.toolchain.cmake' + - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/riscv/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-riscv64.yml' + - 'toolchains/riscv64-linux-gnu.toolchain.cmake' + - 'toolchains/riscv64-unknown-linux-gnu.toolchain.cmake' + - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/riscv/**' + - 'tests/**' +concurrency: + group: linux-riscv64-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + gcc-riscv64: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v4 + + - name: cache-qemu + id: cache-qemu + uses: actions/cache@v4 + with: + path: qemu-install + key: qemu-riscv64-install-20220502-4 + - name: install-qemu-build-deps + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build + - name: checkout-qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + uses: actions/checkout@v4 + with: + repository: qemu/qemu + path: qemu + ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 + - name: qemu + if: steps.cache-qemu.outputs.cache-hit != 'true' + run: | + cd qemu + wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system + make -j$(nproc) + make install + + - name: riscv64-gnu-toolchain + run: | + sudo apt-get update + sudo apt-get install g++-riscv64-linux-gnu + + - name: configure + run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
+ - name: build + run: cmake --build build -j $(nproc) + + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/riscv64-linux-gnu" ctest --output-on-failure -j $(nproc) + + c906: + runs-on: [self-hosted, linux, ubuntu] + steps: + - uses: actions/checkout@v4 + + - name: configure + run: | + export RISCV_ROOT_PATH=/data/action/osd/Xuantie-900-gcc-linux-6.6.0-glibc-x86_64-V3.0.1 + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/c906-v301.toolchain.cmake -DCMAKE_BUILD_TYPE=release -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=OFF -DNCNN_XTHEADVECTOR=ON -DNCNN_ZFH=ON -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON -DNCNN_BUILD_TESTS=ON .. + - name: build + run: cmake --build build -j 8 + + - name: test + run: | + export PATH=/data/action/osd/Xuantie-qemu-x86_64-Ubuntu-20.04-V5.0.4-B20241127-1130/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;c906fdv" ctest --output-on-failure -j 8 + + c910: + runs-on: [self-hosted, linux, ubuntu] + steps: + - uses: actions/checkout@v4 + + - name: configure + run: | + export RISCV_ROOT_PATH=/data/action/osd/Xuantie-900-gcc-linux-6.6.0-glibc-x86_64-V3.0.1 + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/c910-v301.toolchain.cmake -DCMAKE_BUILD_TYPE=release -DNCNN_OPENMP=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=OFF -DNCNN_XTHEADVECTOR=ON -DNCNN_ZFH=ON -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON -DNCNN_BUILD_TESTS=ON .. + - name: build + run: cmake --build build -j 8 + + - name: test + run: | + export PATH=/data/action/osd/Xuantie-qemu-x86_64-Ubuntu-20.04-V5.0.4-B20241127-1130/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;c910v" ctest --output-on-failure -j 8 + + gcc-rvv: + runs-on: [self-hosted, linux, ubuntu] + steps: + - uses: actions/checkout@v4 + + #- name: cache-qemu + #id: cache-qemu + #uses: actions/cache@v4 + #with: + #path: qemu-install + #key: qemu-riscv64-install-20241202 + #- name: install-qemu-build-deps + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #run: | + #sudo apt-get update + #sudo apt-get install autoconf automake autotools-dev ninja-build + #- name: checkout-qemu + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #uses: actions/checkout@v4 + #with: + #repository: qemu/qemu + #path: qemu + #ref: 72b88908d12ee9347d13539c7dd9a252625158d1 + #- name: qemu + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #run: | + #cd qemu + #./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system + #make -j4 + #make install + + #- name: cache-riscv + #id: cache-riscv + #uses: actions/cache@v4 + #with: + #path: riscv-install + #key: riscv-linux-install-20241202 + + #- name: install-riscv-build-deps + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #sudo apt-get update + #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler + + #- name: checkout-riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #uses: actions/checkout@v4 + #with: + #repository: riscv-collab/riscv-gnu-toolchain + #path: riscv-gnu-toolchain + #ref: 20f615317e2ce888dfc11b29ccde4a649494b654 + #- name: checkout-riscv-gnu-toolchain-submodules + #if: 
steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-gnu-toolchain + #git submodule update --init --recursive --depth 1 glibc + #git submodule update --init --recursive --depth 1 newlib + #git submodule update --init --recursive --depth 1 riscv-binutils + #git submodule update --init --recursive --depth 1 riscv-gcc + #git submodule update --init --recursive --depth 1 riscv-dejagnu + #git submodule update --init --recursive --depth 1 riscv-gdb + #- name: riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-gnu-toolchain + #./configure --prefix=$GITHUB_WORKSPACE/riscv + #make linux -j4 + + #- name: riscv-strip-install + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: find $GITHUB_WORKSPACE/riscv -type f | xargs -i strip -g {} || true + + - name: configure + run: export RISCV_ROOT_PATH=/data/action/osd/riscv && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + - name: build + run: cmake --build build -j 8 + + - name: test-vlen256 + run: | + export PATH=/data/action/osd/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,zfh=true,zvfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/riscv/sysroot" ctest --output-on-failure -j 8 + + - name: test-vlen128 + run: | + export PATH=/data/action/osd/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,zfh=true,zvfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/riscv/sysroot" ctest --output-on-failure -j 8 + + clang-rvv: + runs-on: [self-hosted, linux, ubuntu] + steps: + - uses: actions/checkout@v4 + + #- name: cache-qemu + #id: cache-qemu + #uses: actions/cache@v4 + #with: + #path: qemu-install + #key: qemu-riscv64-install-20241202 + #- name: install-qemu-build-deps + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #run: | + #sudo apt-get update + #sudo apt-get install autoconf automake autotools-dev ninja-build + #- name: checkout-qemu + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #uses: actions/checkout@v4 + #with: + #repository: qemu/qemu + #path: qemu + #ref: 72b88908d12ee9347d13539c7dd9a252625158d1 + #- name: qemu + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #run: | + #cd qemu + #./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system + #make -j4 + #make install + + #- name: cache-riscv + #id: cache-riscv + #uses: actions/cache@v4 + #with: + #path: riscv-install + #key: riscv-linux-install-20241202 + + #- name: install-riscv-build-deps + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #sudo apt-get update + #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler + + #- name: checkout-riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #uses: actions/checkout@v4 + #with: + #repository: riscv-collab/riscv-gnu-toolchain + #path: riscv-gnu-toolchain + #ref: 20f615317e2ce888dfc11b29ccde4a649494b654 + #- name: checkout-riscv-gnu-toolchain-submodules + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-gnu-toolchain + #git submodule update --init --recursive --depth 1 glibc + #git submodule update --init 
--recursive --depth 1 newlib + #git submodule update --init --recursive --depth 1 riscv-binutils + #git submodule update --init --recursive --depth 1 riscv-gcc + #git submodule update --init --recursive --depth 1 riscv-dejagnu + #git submodule update --init --recursive --depth 1 riscv-gdb + #- name: riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-gnu-toolchain + #./configure --prefix=$GITHUB_WORKSPACE/riscv + #make linux -j4 + + #- name: riscv-strip-install + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: find $GITHUB_WORKSPACE/riscv -type f | xargs -i strip -g {} || true + + # - name: install-clang + # run: | + # wget https://github.com/llvm/llvm-project/releases/download/llvmorg-19.1.4/llvm-project-19.1.4.src.tar.xz + # tar -xf llvm-project-19.1.4.src.tar.xz + # cd llvm-project-19.1.4.src + # mkdir build + # cd build + # cmake -DCMAKE_INSTALL_PREFIX=$GITHUB_WORKSPACE/riscv -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_TARGETS_TO_BUILD="RISCV" -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF ../llvm/ + # make -j16 + # make install + + - name: build + run: | + export RISCV_ROOT_PATH=/data/action/osd/riscv + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 8 + + - name: test-vlen256 + run: | + export PATH=/data/action/osd/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,zfh=true,zvfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/riscv/sysroot" ctest --output-on-failure -j 8 + + - name: test-vlen128 + run: | + export PATH=/data/action/osd/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,zfh=true,zvfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/riscv/sysroot" ctest --output-on-failure -j 8 diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml index 6bc398f111f..e846f2fd846 100644 --- a/.github/workflows/test-coverage.yml +++ b/.github/workflows/test-coverage.yml @@ -59,51 +59,13 @@ jobs: uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-1 - continue-on-error: true - id: codecov-vlen256-retry-1 - if: steps.codecov.outcome=='failure' - uses: codecov/codecov-action@v5 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-2 - continue-on-error: true - id: codecov-vlen256-retry-2 - if: steps.codecov-vlen256-retry-1.outcome=='failure' - uses: codecov/codecov-action@v5 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-3 - continue-on-error: true - id: codecov-vlen256-retry-3 - if: steps.codecov-vlen256-retry-2.outcome=='failure' - uses: codecov/codecov-action@v5 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-4 - continue-on-error: true - id: codecov-vlen256-retry-4 - if: steps.codecov-vlen256-retry-3.outcome=='failure' - uses: codecov/codecov-action@v5 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-5 - continue-on-error: true - id: codecov-vlen256-retry-5 - if: steps.codecov-vlen256-retry-4.outcome=='failure' - uses: codecov/codecov-action@v5 - 
with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info + disable_search: true + plugins: noop + files: build/lcov.info - name: set the status if: always() run: | - if ${{ steps.codecov.outcome=='success' || steps.codecov-vlen256-retry-1.outcome=='success' || steps.codecov-vlen256-retry-2.outcome=='success' || steps.codecov-vlen256-retry-3.outcome=='success' || steps.codecov-vlen256-retry-4.outcome=='success' || steps.codecov-vlen256-retry-5.outcome=='success' }}; then + if ${{ steps.codecov.outcome=='success' }}; then echo fine else exit 1 @@ -144,4 +106,50 @@ jobs: uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} - file: build-avx512-spr/lcov.info + disable_search: true + plugins: noop + files: build-avx512-spr/lcov.info + + linux-gcc-riscv64-rvv: + strategy: + matrix: + openmp: [ON, OFF] + runs-on: [self-hosted, linux, ubuntu] + steps: + - uses: actions/checkout@v4 + - name: build + run: | + export RISCV_ROOT_PATH=/data/action/osd/riscv + mkdir build + cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_ZFH=ON -DNCNN_ZVFH=ON -DNCNN_OPENMP=${{ matrix.openmp }} -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j 8 + + - name: test-vlen256 + run: | + export PATH=/data/action/osd/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,zfh=true,zvfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/riscv/sysroot" ctest --output-on-failure -j 8 + + - name: test-vlen128 + run: | + export PATH=/data/action/osd/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,zfh=true,zvfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/riscv/sysroot" ctest --output-on-failure -j 8 + + - name: lcov-collect + run: | + cd build + lcov --gcov-tool /data/action/osd/riscv/bin/riscv64-unknown-linux-gnu-gcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/install/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov --list lcov.info + + - name: codecov + uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} + disable_search: true + plugins: noop + files: build/lcov.info diff --git a/CMakeLists.txt b/CMakeLists.txt index bf0e9f20fb8..473440cc454 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -420,40 +420,44 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)") if(CMAKE_SIZEOF_VOID_P EQUAL 8) set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv") - check_cxx_source_compiles("#include \nint main() { vfloat32m1_t _s, _w; float _v; size_t vl; _s = vfmacc_vf_f32m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV) + check_cxx_source_compiles("#include \nint main() { vfloat32m8_t _s, _w; float _v; size_t vl; _s = __riscv_vfmacc_vf_f32m8(_s, _v, _w, vl); vfloat32m1_t _x; vfloat32m1x2_t _xx = __riscv_vcreate_v_f32m1x2(_x, _x); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_V) - set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh") - check_cxx_source_compiles("#include \nint main() { vfloat16m1_t _s, _w; __fp16 _v; size_t vl; _s = vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV_ZFH) + set(CMAKE_REQUIRED_FLAGS "-march=rv64gc_zfh -D__fp16=_Float16") + check_cxx_source_compiles("int main() { __fp16 s, v; s = v * v; return 0; }" NCNN_COMPILER_SUPPORT_RISCV_ZFH) - if(NOT 
NCNN_COMPILER_SUPPORT_RVV_ZFH) - set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16") - check_cxx_source_compiles("#include \nint main() { vfloat16m1_t _s, _w; __fp16 _v; size_t vl; _s = vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV_ZVFH) - endif() + set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16") + check_cxx_source_compiles("#include \nint main() { vfloat16m8_t _s, _w; __fp16 _v; size_t vl; _s = __riscv_vfmacc_vf_f16m8(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_ZVFH) + + set(CMAKE_REQUIRED_FLAGS "-march=rv64gc_zfh_xtheadvector -D__fp16=_Float16") + check_cxx_source_compiles("#include \nint main() { vfloat16m8_t _s, _w; __fp16 _v; size_t vl; _s = __riscv_vfmacc_vf_f16m8(_s, _v, _w, vl); vfloat32m1_t _x; vfloat32m1x2_t _xx = __riscv_vcreate_v_f32m1x2(_x, _x); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR) unset(CMAKE_REQUIRED_FLAGS) - if(NCNN_COMPILER_SUPPORT_RVV) + if(NCNN_COMPILER_SUPPORT_RISCV_V OR NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR) option(NCNN_RVV "optimize risc-v platform with v extension" ON) - option(NCNN_RVV_CHECK_VFREDSUM "check compilter about support rvv-intrinsic" ON) - if(NCNN_RVV_CHECK_VFREDSUM) - include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/ncnn_check_rvv_vfredusum.cmake) - endif() - if(NOT (NCNN_COMPILER_SUPPORT_RVV_ZFH OR NCNN_COMPILER_SUPPORT_RVV_ZVFH)) - message(WARNING "The compiler does not support risc-v zfh extension. Upgrading your toolchain is strongly recommended.") - endif() - option(NCNN_RVV_CHECK_PLAIN_SEGMENT "check compilter about rvv segment load/store interface" ON) - if(NCNN_RVV_CHECK_PLAIN_SEGMENT) - set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv") - check_cxx_source_compiles("#include \nint main() { vfloat32m1_t _s, _w; size_t vl; float src[32]={.0f}; vlseg2e32_v_f32m1(&_s, &_w, src, vl); return 0; }" NCNN_COMPILER_USE_RVV_PLAIN_SEG) - unset(CMAKE_REQUIRED_FLAGS) - endif() - if(NOT NCNN_COMPILER_USE_RVV_PLAIN_SEG) - message(WARNING "The compiler uses tuple types for segment load/store. Upgrading your toolchain is strongly recommended.") - add_definitions(-D__rvv_tuple) + else() + message(WARNING "The compiler does not support risc-v v or xtheadvector extension. NCNN_RVV will be OFF.") + endif() + + if(NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR) + option(NCNN_XTHEADVECTOR "optimize risc-v platform with xtheadvector extension" ON) + else() + message(WARNING "The compiler does not support risc-v xtheadvector extension. NCNN_XTHEADVECTOR will be OFF.") + endif() + + if(NCNN_COMPILER_SUPPORT_RISCV_ZFH) + option(NCNN_ZFH "optimize risc-v platform with zfh extension" ON) + if(NCNN_COMPILER_SUPPORT_RISCV_ZVFH OR NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR) + if(NCNN_RVV AND NCNN_ZFH) + option(NCNN_ZVFH "optimize risc-v platform with zvfh extension" ON) + endif() + else() + message(WARNING "The compiler does not support zvfh extension. NCNN_ZVFH will be OFF.") endif() else() - message(WARNING "The compiler does not support risc-v v extension. NCNN_RVV will be OFF.") + message(WARNING "The compiler does not support risc-v zfh extension. 
NCNN_ZFH will be OFF.") endif() + endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)") set(NCNN_TARGET_ARCH powerpc) diff --git a/build-android.cmd b/build-android.cmd index b621dae6c1a..fd1a83700f4 100644 --- a/build-android.cmd +++ b/build-android.cmd @@ -1,7 +1,7 @@ :: Set android ndk root @ECHO OFF @SETLOCAL -@SET ANDROID_NDK= +@SET ANDROID_NDK= :: Set ninja.exe :: @SET NINJA_EXE= @@ -38,4 +38,12 @@ cmake --build . --parallel %NUMBER_OF_PROCESSORS% cmake --build . --target install popd +:: android riscv64 +mkdir build-android-riscv64 +pushd build-android-riscv64 +cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=%ANDROID_NDK%/build/cmake/android.toolchain.cmake -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="riscv64" -DANDROID_PLATFORM=android-35 -DNCNN_VULKAN=ON .. +cmake --build . --parallel %NUMBER_OF_PROCESSORS% +cmake --build . --target install +popd + @ENDLOCAL diff --git a/build.sh b/build.sh index 754aaf8a4cd..4b03cec34b3 100755 --- a/build.sh +++ b/build.sh @@ -40,6 +40,14 @@ make -j4 make install popd +##### android riscv64 +mkdir -p build-android-riscv64 +pushd build-android-riscv64 +cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI="riscv64" -DANDROID_PLATFORM=android-35 -DNCNN_VULKAN=ON .. +make -j4 +make install +popd + ##### linux of hisiv300 (forgot the chip name) toolchain with neon and openmp mkdir -p build-hisiv300-linux pushd build-hisiv300-linux diff --git a/cmake/ncnn_add_layer.cmake b/cmake/ncnn_add_layer.cmake index 50d7dcf89ce..cbc1b09e0b1 100644 --- a/cmake/ncnn_add_layer.cmake +++ b/cmake/ncnn_add_layer.cmake @@ -54,6 +54,28 @@ macro(ncnn_add_arch_opt_source class NCNN_TARGET_ARCH_OPT NCNN_TARGET_ARCH_OPT_C endif() endmacro() +macro(ncnn_add_arch_opt_layer_source class NCNN_TARGET_ARCH_OPT_BASE NCNN_TARGET_ARCH_OPT NCNN_TARGET_ARCH_OPT_CFLAGS) + set(NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT_BASE}.cpp) + + if(WITH_LAYER_${name} AND EXISTS ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE}) + + set(NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE ${CMAKE_CURRENT_BINARY_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}.cpp) + + add_custom_command( + OUTPUT ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE} + COMMAND ${CMAKE_COMMAND} -DSRC=${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE} -DDST=${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE} -DCLASS=${class} -P "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_generate_${NCNN_TARGET_ARCH_OPT}_source.cmake" + DEPENDS ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE} + COMMENT "Generating source ${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}.cpp" + VERBATIM + ) + set_source_files_properties(${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE} PROPERTIES GENERATED TRUE) + + set_source_files_properties(${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE} PROPERTIES COMPILE_FLAGS ${NCNN_TARGET_ARCH_OPT_CFLAGS}) + + list(APPEND ncnn_SRCS ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE}) + endif() +endmacro() + macro(ncnn_add_layer class) string(TOLOWER ${class} name) @@ -392,14 +414,25 @@ macro(ncnn_add_layer class) if(NCNN_TARGET_ARCH STREQUAL "riscv" AND CMAKE_SIZEOF_VOID_P EQUAL 8) if(NCNN_RUNTIME_CPU AND NCNN_RVV) - if(NCNN_COMPILER_SUPPORT_RVV_ZFH) - 
ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv_zfh") - elseif(NCNN_COMPILER_SUPPORT_RVV_ZVFH) - ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16") - elseif(NCNN_COMPILER_SUPPORT_RVV) - ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv") + ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv") + endif() + if(NCNN_ZFH) + if(NOT NCNN_RUNTIME_CPU AND NCNN_ZVFH) + ncnn_add_arch_opt_source(${class} zfh "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16") + elseif(NOT NCNN_RUNTIME_CPU AND NCNN_XTHEADVECTOR) + ncnn_add_arch_opt_source(${class} zfh "-march=rv64gc_zfh_xtheadvector -D__riscv_zvfh=1 -D__fp16=_Float16") + else() + ncnn_add_arch_opt_source(${class} zfh "-march=rv64gc_zfh -D__fp16=_Float16") endif() endif() + if(NCNN_RUNTIME_CPU AND NCNN_XTHEADVECTOR) + # linker complains the conflict of v and xtheadvector, so disable generating any riscv attributes + ncnn_add_arch_opt_layer(${class} xtheadvector "-march=rv64gc_xtheadvector -mno-riscv-attribute -Wa,-mno-arch-attr") + ncnn_add_arch_opt_layer_source(${class} zfh xtheadvector "-march=rv64gc_zfh_xtheadvector -mno-riscv-attribute -Wa,-mno-arch-attr -D__fp16=_Float16") + endif() + if(NCNN_RUNTIME_CPU AND NCNN_ZVFH) + ncnn_add_arch_opt_layer_source(${class} zfh rvv "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16") + endif() endif() # generate layer_type_enum file diff --git a/cmake/ncnn_check_rvv_vfredusum.cmake b/cmake/ncnn_check_rvv_vfredusum.cmake deleted file mode 100644 index 59065556356..00000000000 --- a/cmake/ncnn_check_rvv_vfredusum.cmake +++ /dev/null @@ -1,46 +0,0 @@ -include(CheckCXXSourceCompiles) - -set(TEMP_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) -set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv") - -check_cxx_source_compiles(" -#include -int main(void) -{ - float in1[4] = {-1.f,0.f,+1.f,2.f}; - float out1=0; - size_t vl = vsetvl_e32m8(4); - vfloat32m8_t _add = vle32_v_f32m8(in1,vl); - vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(),out1,vl); - _sum = vfredsum_vs_f32m8_f32m1(_sum, _add, _sum, vl); - out1 = vfmv_f_s_f32m1_f32(_sum); - return 0; -} -" NCNN_COMPILER_USE_VFREDSUM) -check_cxx_source_compiles(" -#include -int main(void) -{ - float in1[4] = {-1.f,0.f,+1.f,2.f}; - float out1=0; - size_t vl = vsetvl_e32m8(4); - vfloat32m8_t _add = vle32_v_f32m8(in1,vl); - vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(),out1,vl); - _sum = vfredusum_vs_f32m8_f32m1(_sum, _add, _sum, vl); - out1 = vfmv_f_s_f32m1_f32(_sum); - return 0; -}; -" NCNN_COMPILER_USE_VFREDUSUM) - -if(NCNN_COMPILER_USE_VFREDSUM AND NOT NCNN_COMPILER_USE_VFREDUSUM) - message(WARNING "The compiler uses vfredsum. 
Upgrading your toolchain is strongly recommended.") - foreach(LMUL 1 2 4 8) - add_definitions(-Dvfredusum_vs_f32m${LMUL}_f32m1=vfredsum_vs_f32m${LMUL}_f32m1) - if(NCNN_COMPILER_SUPPORT_RVV_ZFH OR NCNN_COMPILER_SUPPORT_RVV_ZVFH) - add_definitions(-Dvfredusum_vs_f16m${LMUL}_f16m1=vfredsum_vs_f16m${LMUL}_f16m1) - endif() - endforeach() -endif() - -set(CMAKE_REQUIRED_FLAGS ${TEMP_CMAKE_REQUIRED_FLAGS}) -unset(TEMP_CMAKE_REQUIRED_FLAGS) diff --git a/cmake/ncnn_generate_xtheadvector_source.cmake b/cmake/ncnn_generate_xtheadvector_source.cmake new file mode 100644 index 00000000000..e9c7b00c818 --- /dev/null +++ b/cmake/ncnn_generate_xtheadvector_source.cmake @@ -0,0 +1,14 @@ + +# must define SRC DST CLASS + +file(READ ${SRC} source_data) + +# replace +string(TOUPPER ${CLASS} CLASS_UPPER) +string(TOLOWER ${CLASS} CLASS_LOWER) + +string(REGEX REPLACE "LAYER_${CLASS_UPPER}_RISCV_H" "LAYER_${CLASS_UPPER}_RISCV_XTHEADVECTOR_H" source_data "${source_data}") +string(REGEX REPLACE "${CLASS}_riscv" "${CLASS}_riscv_xtheadvector" source_data "${source_data}") +string(REGEX REPLACE "#include \"${CLASS_LOWER}_riscv.h\"" "#include \"${CLASS_LOWER}_riscv_xtheadvector.h\"" source_data "${source_data}") + +file(WRITE ${DST} "${source_data}") diff --git a/docs/how-to-build/how-to-build.md b/docs/how-to-build/how-to-build.md index b423834c501..bb69aba7800 100644 --- a/docs/how-to-build/how-to-build.md +++ b/docs/how-to-build/how-to-build.md @@ -667,11 +667,11 @@ Pick `build-XYZ/install` folder for further usage. ### Build for AllWinner D1 -Download c906 toolchain package from https://xuantie.t-head.cn/community/download?id=4224193099938729984 +Download c906 toolchain package from https://www.xrvm.cn/community/download?id=4382928864901402624 ```shell -tar -xf Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz -export RISCV_ROOT_PATH=/home/nihui/osd/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1 +tar -xf Xuantie-900-gcc-linux-6.6.0-glibc-x86_64-V3.0.1-20241120.tar.gz +export RISCV_ROOT_PATH=/home/nihui/osd/Xuantie-900-gcc-linux-6.6.0-glibc-x86_64-V3.0.1 ``` Build ncnn with riscv-v vector and simpleocv enabled: @@ -679,7 +679,7 @@ Build ncnn with riscv-v vector and simpleocv enabled: mkdir -p build-c906 cd build-c906 cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/c906-v226.toolchain.cmake \ - -DCMAKE_BUILD_TYPE=release -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON \ + -DCMAKE_BUILD_TYPE=release -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=OFF -DNCNN_XTHEADVECTOR=ON -DNCNN_ZFH=ON \ -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON .. cmake --build . -j 4 cmake --build . 
--target install diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b27f8ad42bd..cd5f9b6728d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -666,16 +666,24 @@ if(NCNN_TARGET_ARCH STREQUAL "loongarch") endif() endif() -if(NCNN_TARGET_ARCH STREQUAL "riscv" AND NOT C906) +if(NCNN_TARGET_ARCH STREQUAL "riscv" AND CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT C906) if(NOT NCNN_RUNTIME_CPU AND NCNN_RVV) - if(NCNN_COMPILER_SUPPORT_RVV_ZFH) - target_compile_options(ncnn PRIVATE -march=rv64gcv_zfh) - elseif(NCNN_COMPILER_SUPPORT_RVV_ZVFH) - target_compile_options(ncnn PRIVATE -march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16) - elseif(NCNN_COMPILER_SUPPORT_RVV) - target_compile_options(ncnn PRIVATE -march=rv64gcv) + set(RISCV_MARCH_FLAG "-march=rv64gcv") + if(NCNN_ZFH) + set(RISCV_MARCH_FLAG "${RISCV_MARCH_FLAG}_zfh") + target_compile_options(ncnn PRIVATE -D__fp16=_Float16) + endif() + if(NCNN_ZVFH) + set(RISCV_MARCH_FLAG "${RISCV_MARCH_FLAG}_zvfh") + endif() + elseif(NOT NCNN_RUNTIME_CPU AND NCNN_XTHEADVECTOR) + set(RISCV_MARCH_FLAG "-march=rv64gc_xtheadvector") + if(NCNN_ZFH) + set(RISCV_MARCH_FLAG "${RISCV_MARCH_FLAG}_zfh") + target_compile_options(ncnn PRIVATE -D__riscv_zvfh=1 -D__fp16=_Float16) endif() endif() + target_compile_options(ncnn PRIVATE ${RISCV_MARCH_FLAG}) endif() if(NCNN_PPC64LE_VSX) diff --git a/src/c_api.cpp b/src/c_api.cpp index f8146e054c2..2405598b4d9 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -782,7 +782,7 @@ ncnn_modelbin_t ncnn_modelbin_create_from_mat_array(const ncnn_mat_t* weights, i matarray[i] = *(const Mat*)weights[i]; } ncnn_modelbin_t mb = (ncnn_modelbin_t)malloc(sizeof(struct __ncnn_modelbin_t)); - mb->pthis = (void*)(new ModelBinFromMatArray_c_api(mb, &matarray[0])); + mb->pthis = (void*)(new ModelBinFromMatArray_c_api(mb, n ? &matarray[0] : NULL)); mb->load_1d = __ncnn_ModelBinFromMatArray_load_1d; mb->load_2d = __ncnn_ModelBinFromMatArray_load_2d; mb->load_3d = __ncnn_ModelBinFromMatArray_load_3d; diff --git a/src/cpu.cpp b/src/cpu.cpp index c9307619ce9..0c49eae50d4 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -2717,8 +2717,44 @@ int cpu_support_riscv_zfh() #endif } +int cpu_support_riscv_zvfh() +{ + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __riscv + // v + f does not imply zfh, but how to discover zvfh properly ? + // upstream issue https://github.com/riscv/riscv-isa-manual/issues/414 + return g_hwcaps & COMPAT_HWCAP_ISA_V && g_hwcaps & COMPAT_HWCAP_ISA_F; +#else + return 0; +#endif +#else + return 0; +#endif +} + +int cpu_support_riscv_xtheadvector() +{ + try_initialize_global_cpu_info(); +#if defined __ANDROID__ || defined __linux__ +#if __riscv + // v + f does not imply zfh, but how to discover zvfh properly ? 
+ // upstream issue https://github.com/riscv/riscv-isa-manual/issues/414 + return g_hwcaps & COMPAT_HWCAP_ISA_V && g_hwcaps & COMPAT_HWCAP_ISA_F; +#else + return 0; +#endif +#else + return 0; +#endif +} + int cpu_riscv_vlenb() { +#if C906 + // FIXME xuantie qemu reports all zero auxv flags + return 16; +#endif try_initialize_global_cpu_info(); #if __riscv if (!cpu_support_riscv_v()) diff --git a/src/cpu.h b/src/cpu.h index f0e4728633f..6a3fcea293e 100644 --- a/src/cpu.h +++ b/src/cpu.h @@ -122,6 +122,10 @@ NCNN_EXPORT int cpu_support_loongson_mmi(); NCNN_EXPORT int cpu_support_riscv_v(); // zfh = riscv half-precision float NCNN_EXPORT int cpu_support_riscv_zfh(); +// zvfh = riscv vector half-precision float +NCNN_EXPORT int cpu_support_riscv_zvfh(); +// xtheadvector = riscv xtheadvector +NCNN_EXPORT int cpu_support_riscv_xtheadvector(); // vlenb = riscv vector length in bytes NCNN_EXPORT int cpu_riscv_vlenb(); diff --git a/src/layer.cpp b/src/layer.cpp index 3c40cbd67ca..d82595105ce 100644 --- a/src/layer.cpp +++ b/src/layer.cpp @@ -553,6 +553,13 @@ Layer* create_layer_cpu(int index) } else #endif // NCNN_RUNTIME_CPU && NCNN_RVV +#if NCNN_RUNTIME_CPU && NCNN_XTHEADVECTOR + if (ncnn::cpu_support_riscv_xtheadvector()) + { + layer_creator = layer_registry_xtheadvector[index].creator; + } + else +#endif // NCNN_RUNTIME_CPU && NCNN_XTHEADVECTOR { layer_creator = layer_registry_arch[index].creator; } diff --git a/src/layer/noop.cpp b/src/layer/noop.cpp index a8b42f70e83..b14f16ea88d 100644 --- a/src/layer/noop.cpp +++ b/src/layer/noop.cpp @@ -21,7 +21,7 @@ Noop::Noop() { support_inplace = true; support_packing = true; - support_fp16_storage = cpu_support_arm_asimdhp() || cpu_support_riscv_zfh(); + support_fp16_storage = cpu_support_arm_asimdhp() || cpu_support_riscv_zvfh(); support_bf16_storage = true; } diff --git a/src/layer/riscv/absval_riscv.cpp b/src/layer/riscv/absval_riscv.cpp index 092a8b5d6b1..805fe8f54c8 100644 --- a/src/layer/riscv/absval_riscv.cpp +++ b/src/layer/riscv/absval_riscv.cpp @@ -16,32 +16,36 @@ #if __riscv_vector #include - -static inline vfloat32m8_t vfabs_v_f32m8_absval(vfloat32m8_t op1, size_t vl) -{ - return vfsgnjx_vv_f32m8(op1, op1, vl); -} -static inline vfloat16m8_t vfabs_v_f16m8_absval(vfloat16m8_t op1, size_t vl) -{ - return vfsgnjx_vv_f16m8(op1, op1, vl); -} #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { AbsVal_riscv::AbsVal_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif } +#if __riscv_vector +static inline vfloat32m8_t __riscv_vfabs_v_f32m8_absval(vfloat32m8_t op1, size_t vl) +{ + return __riscv_vfsgnjx_vv_f32m8(op1, op1, vl); +} +#endif // __riscv_vector + int AbsVal_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) @@ -66,11 +70,11 @@ int AbsVal_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - size_t vl = vsetvl_e32m8(n); + size_t vl = __riscv_vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - _p = vfabs_v_f32m8_absval(_p, vl); - vse32_v_f32m8(ptr, _p, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + _p = __riscv_vfabs_v_f32m8_absval(_p, vl); + __riscv_vse32_v_f32m8(ptr, _p, 
vl); ptr += vl; n -= vl; @@ -88,37 +92,4 @@ int AbsVal_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return 0; } -#if __riscv_vector && __riscv_zfh -int AbsVal_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const -{ - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int d = bottom_top_blob.d; - int channels = bottom_top_blob.c; - int elempack = bottom_top_blob.elempack; - int size = w * h * d * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - _p = vfabs_v_f16m8_absval(_p, vl); - vse16_v_f16m8(ptr, _p, vl); - - ptr += vl; - n -= vl; - } - } - - return 0; -} -#endif - } // namespace ncnn diff --git a/src/layer/riscv/absval_riscv.h b/src/layer/riscv/absval_riscv.h index 0d35c6b61a0..ca9bde067fa 100644 --- a/src/layer/riscv/absval_riscv.h +++ b/src/layer/riscv/absval_riscv.h @@ -27,7 +27,7 @@ class AbsVal_riscv : public AbsVal virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif }; diff --git a/src/layer/riscv/absval_riscv_zfh.cpp b/src/layer/riscv/absval_riscv_zfh.cpp new file mode 100644 index 00000000000..43bb6a2b0bc --- /dev/null +++ b/src/layer/riscv/absval_riscv_zfh.cpp @@ -0,0 +1,71 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "absval_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +namespace ncnn { + +#if __riscv_zvfh +static inline vfloat16m8_t __riscv_vfabs_v_f16m8_absval(vfloat16m8_t op1, size_t vl) +{ + return __riscv_vfsgnjx_vv_f16m8(op1, op1, vl); +} +#endif // __riscv_zvfh + +#if NCNN_ZFH +int AbsVal_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + const int w = bottom_top_blob.w; + const int h = bottom_top_blob.h; + const int d = bottom_top_blob.d; + const int channels = bottom_top_blob.c; + const int elempack = bottom_top_blob.elempack; + const int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + _p = __riscv_vfabs_v_f16m8_absval(_p, vl); + __riscv_vse16_v_f16m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = (*ptr > (__fp16)0.f) ? 
(*ptr) : (-*ptr); + ptr++; + } +#endif // __riscv_zvfh + } + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/batchnorm_riscv.cpp b/src/layer/riscv/batchnorm_riscv.cpp index 2a8ec0cce58..e6e5af89033 100644 --- a/src/layer/riscv/batchnorm_riscv.cpp +++ b/src/layer/riscv/batchnorm_riscv.cpp @@ -16,9 +16,10 @@ #if __riscv_vector #include +#include "riscv_usability.h" #endif // __riscv_vector -#include "riscv_usability.h" +#include "cpu.h" namespace ncnn { @@ -26,18 +27,21 @@ BatchNorm_riscv::BatchNorm_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif } int BatchNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if __riscv_vector +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); -#if __riscv_zfh if (opt.use_fp16_storage && elembits == 16) { if (opt.use_fp16_arithmetic) @@ -46,8 +50,9 @@ int BatchNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co return forward_inplace_fp16s(bottom_top_blob, opt); } #endif + int elempack = bottom_top_blob.elempack; -#endif // __riscv_vector + int dims = bottom_top_blob.dims; if (dims == 1) { @@ -58,15 +63,15 @@ int BatchNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co int n = bottom_top_blob.w * elempack; while (n > 0) { - size_t vl = vsetvl_e32m8(n); + size_t vl = __riscv_vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - vfloat32m8_t _a = vle32_v_f32m8(ptr_a, vl); - vfloat32m8_t _b = vle32_v_f32m8(ptr_b, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vfloat32m8_t _a = __riscv_vle32_v_f32m8(ptr_a, vl); + vfloat32m8_t _b = __riscv_vle32_v_f32m8(ptr_b, vl); - _p = vfmadd_vv_f32m8(_p, _b, _a, vl); + _p = __riscv_vfmadd_vv_f32m8(_p, _b, _a, vl); - vse32_v_f32m8(ptr, _p, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; ptr_a += vl; @@ -75,7 +80,6 @@ int BatchNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co } #else int w = bottom_top_blob.w; - #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < w; i++) { ptr[i] = b_data[i] * ptr[i] + a_data[i]; @@ -103,11 +107,11 @@ int BatchNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co int n = w; while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - _p = vfmul_vf_f32m8(_p, b, vl); - _p = vfadd_vf_f32m8(_p, a, vl); - vse32_v_f32m8(ptr, _p, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + _p = __riscv_vfmul_vf_f32m8(_p, b, vl); + _p = __riscv_vfadd_vf_f32m8(_p, a, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; n -= vl; @@ -137,11 +141,11 @@ int BatchNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co int n = size; while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - _p = vfmul_vf_f32m8(_p, b, vl); - _p = vfadd_vf_f32m8(_p, a, vl); - vse32_v_f32m8(ptr, _p, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + _p = __riscv_vfmul_vf_f32m8(_p, b, vl); + _p = __riscv_vfadd_vf_f32m8(_p, a, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; n -= vl; @@ -164,7 +168,7 @@ int BatchNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co int w = bottom_top_blob.w; int h = 
bottom_top_blob.h; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); if (dims == 2) { #pragma omp parallel for num_threads(opt.num_threads) @@ -177,13 +181,13 @@ int BatchNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co ptr_b += i * elempack; int n = w * elempack; - vfloat32m1_t _a = vle32_v_f32m1(ptr_a, vl); - vfloat32m1_t _b = vle32_v_f32m1(ptr_b, vl); + vfloat32m1_t _a = __riscv_vle32_v_f32m1(ptr_a, vl); + vfloat32m1_t _b = __riscv_vle32_v_f32m1(ptr_b, vl); while (n > 0) { - vfloat32m1_t _p = vle32_v_f32m1(ptr, vl); - _p = vfmadd_vv_f32m1(_p, _b, _a, vl); - vse32_v_f32m1(ptr, _p, vl); + vfloat32m1_t _p = __riscv_vle32_v_f32m1(ptr, vl); + _p = __riscv_vfmadd_vv_f32m1(_p, _b, _a, vl); + __riscv_vse32_v_f32m1(ptr, _p, vl); ptr += vl; n -= vl; @@ -204,15 +208,15 @@ int BatchNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co const float* ptr_a = (const float*)a_data + q * elempack; const float* ptr_b = (const float*)b_data + q * elempack; - vfloat32m1_t _a = vle32_v_f32m1(ptr_a, vl); - vfloat32m1_t _b = vle32_v_f32m1(ptr_b, vl); + vfloat32m1_t _a = __riscv_vle32_v_f32m1(ptr_a, vl); + vfloat32m1_t _b = __riscv_vle32_v_f32m1(ptr_b, vl); int n = size; while (n > 0) { - vfloat32m1_t _p = vle32_v_f32m1(ptr, vl); - _p = vfmadd_vv_f32m1(_p, _b, _a, vl); - vse32_v_f32m1(ptr, _p, vl); + vfloat32m1_t _p = __riscv_vle32_v_f32m1(ptr, vl); + _p = __riscv_vfmadd_vv_f32m1(_p, _b, _a, vl); + __riscv_vse32_v_f32m1(ptr, _p, vl); ptr += vl; n -= vl; @@ -224,314 +228,4 @@ int BatchNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co return 0; } -#if __riscv_vector && __riscv_zfh -int BatchNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const -{ - int dims = bottom_top_blob.dims; - int elempack = bottom_top_blob.elempack; - if (dims == 1) - { - int n = bottom_top_blob.w * elempack; - __fp16* ptr = bottom_top_blob; - const float* ptr_a = a_data; - const float* ptr_b = b_data; - while (n > 0) - { - size_t vl = vsetvl_e16m4(n); - - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); - vfloat32m8_t _a = vle32_v_f32m8(ptr_a, vl); - vfloat32m8_t _b = vle32_v_f32m8(ptr_b, vl); - - _p = vfmadd_vv_f32m8(_p, _b, _a, vl); - - vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); - - ptr += vl; - ptr_a += vl; - ptr_b += vl; - n -= vl; - } - - return 0; - } - - if (elempack == 1) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - if (dims == 2) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - __fp16* ptr = bottom_top_blob.row<__fp16>(i); - float a = a_data[i]; - float b = b_data[i]; - - int n = w; - while (n > 0) - { - size_t vl = vsetvl_e16m4(n); - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); - _p = vfmul_vf_f32m8(_p, b, vl); - _p = vfadd_vf_f32m8(_p, a, vl); - vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); - - ptr += vl; - n -= vl; - } - } - } - if (dims == 3 || dims == 4) - { - int d = bottom_top_blob.d; - int c = bottom_top_blob.c; - int size = w * h * d; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < c; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - float a = a_data[q]; - float b = b_data[q]; - - int n = size; - while (n > 0) - { - size_t vl = vsetvl_e16m4(n); - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); - ; - _p = vfmul_vf_f32m8(_p, b, vl); - _p = vfadd_vf_f32m8(_p, a, vl); - vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); - - 
ptr += vl; - n -= vl; - } - } - } - - return 0; - } - - const int packn = csrr_vlenb() / 2; // fp16 - if (elempack == packn) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - - const size_t vl = vsetvl_e16m1(packn); - if (dims == 2) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - __fp16* ptr = bottom_top_blob.row<__fp16>(i); - const float* ptr_a = (const float*)a_data + i * elempack; - const float* ptr_b = (const float*)b_data + i * elempack; - int n = w * elempack; - - vfloat32m2_t _a = vle32_v_f32m2(ptr_a, vl); - vfloat32m2_t _b = vle32_v_f32m2(ptr_b, vl); - while (n > 0) - { - vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr, vl), vl); - _p = vfmadd_vv_f32m2(_p, _b, _a, vl); - vse16_v_f16m1(ptr, vfncvt_f_f_w_f16m1(_p, vl), vl); - - ptr += vl; - n -= vl; - } - } - } - - if (dims == 3 || dims == 4) - { - int d = bottom_top_blob.d; - int c = bottom_top_blob.c; - int size = w * h * d * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < c; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - const float* ptr_a = (const float*)a_data + q * elempack; - const float* ptr_b = (const float*)b_data + q * elempack; - - vfloat32m2_t _a = vle32_v_f32m2(ptr_a, vl); - vfloat32m2_t _b = vle32_v_f32m2(ptr_b, vl); - - int n = size; - while (n > 0) - { - vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr, vl), vl); - _p = vfmadd_vv_f32m2(_p, _b, _a, vl); - vse16_v_f16m1(ptr, vfncvt_f_f_w_f16m1(_p, vl), vl); - - ptr += vl; - n -= vl; - } - } - } - } - - return 0; -} - -int BatchNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const -{ - int dims = bottom_top_blob.dims; - int elempack = bottom_top_blob.elempack; - if (dims == 1) - { - int n = bottom_top_blob.w * elempack; - __fp16* ptr = bottom_top_blob; - const float* ptr_a = a_data; - const float* ptr_b = b_data; - while (n > 0) - { - size_t vl = vsetvl_e16m4(n); - - vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); - vfloat16m4_t _a = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_a, vl), vl); - vfloat16m4_t _b = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_b, vl), vl); - - _p = vfmadd_vv_f16m4(_p, _b, _a, vl); - - vse16_v_f16m4(ptr, _p, vl); - - ptr += vl; - ptr_a += vl; - ptr_b += vl; - n -= vl; - } - - return 0; - } - - if (elempack == 1) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - if (dims == 2) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - __fp16* ptr = bottom_top_blob.row<__fp16>(i); - float a = a_data[i]; - float b = b_data[i]; - - int n = w; - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - _p = vfmul_vf_f16m8(_p, b, vl); - _p = vfadd_vf_f16m8(_p, a, vl); - vse16_v_f16m8(ptr, _p, vl); - - ptr += vl; - n -= vl; - } - } - } - if (dims == 3 || dims == 4) - { - int d = bottom_top_blob.d; - int c = bottom_top_blob.c; - int size = w * h * d; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < c; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - float a = a_data[q]; - float b = b_data[q]; - - int n = size; - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - ; - _p = vfmul_vf_f16m8(_p, b, vl); - _p = vfadd_vf_f16m8(_p, a, vl); - vse16_v_f16m8(ptr, _p, vl); - - ptr += vl; - n -= vl; - } - } - } - - return 0; - } - - const int packn = csrr_vlenb() / 2; // fp16 - if (elempack == packn) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - - const 
size_t vl = vsetvl_e16m1(packn); - if (dims == 2) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - __fp16* ptr = bottom_top_blob.row<__fp16>(i); - const float* ptr_a = (const float*)a_data + i * elempack; - const float* ptr_b = (const float*)b_data + i * elempack; - int n = w * elempack; - - vfloat16m1_t _a = vfncvt_f_f_w_f16m1(vle32_v_f32m2(ptr_a, vl), vl); - vfloat16m1_t _b = vfncvt_f_f_w_f16m1(vle32_v_f32m2(ptr_b, vl), vl); - while (n > 0) - { - vfloat16m1_t _p = vle16_v_f16m1(ptr, vl); - _p = vfmadd_vv_f16m1(_p, _b, _a, vl); - vse16_v_f16m1(ptr, _p, vl); - - ptr += vl; - n -= vl; - } - } - } - - if (dims == 3 || dims == 4) - { - int d = bottom_top_blob.d; - int c = bottom_top_blob.c; - int size = w * h * d * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < c; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - const float* ptr_a = (const float*)a_data + q * elempack; - const float* ptr_b = (const float*)b_data + q * elempack; - - vfloat16m1_t _a = vfncvt_f_f_w_f16m1(vle32_v_f32m2(ptr_a, vl), vl); - vfloat16m1_t _b = vfncvt_f_f_w_f16m1(vle32_v_f32m2(ptr_b, vl), vl); - - int n = size; - while (n > 0) - { - vfloat16m1_t _p = vle16_v_f16m1(ptr, vl); - _p = vfmadd_vv_f16m1(_p, _b, _a, vl); - vse16_v_f16m1(ptr, _p, vl); - - ptr += vl; - n -= vl; - } - } - } - } - - return 0; -} - -#endif // __riscv_vector && __riscv_zfh } // namespace ncnn diff --git a/src/layer/riscv/batchnorm_riscv.h b/src/layer/riscv/batchnorm_riscv.h index 1ed4dc63d0d..9f9b105cae1 100644 --- a/src/layer/riscv/batchnorm_riscv.h +++ b/src/layer/riscv/batchnorm_riscv.h @@ -26,7 +26,7 @@ class BatchNorm_riscv : public BatchNorm virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/batchnorm_riscv_zfh.cpp b/src/layer/riscv/batchnorm_riscv_zfh.cpp new file mode 100644 index 00000000000..bd2a9be289a --- /dev/null +++ b/src/layer/riscv/batchnorm_riscv_zfh.cpp @@ -0,0 +1,380 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
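Before the batchnorm fp16 code below, it helps to see the shape shared by all of the new per-layer `*_riscv_zfh.cpp` translation units in this patch: the fp16 kernel lives in its own file guarded by `NCNN_ZFH`, the vector loop is guarded by `__riscv_zvfh`, and a plain `__fp16` scalar loop serves as the fallback for zfh-only toolchains. A minimal sketch of that skeleton, using a hypothetical `Foo` layer and a placeholder operation rather than anything from this patch:

```cpp
// foo_riscv_zfh.cpp -- illustrative sketch only, not part of this patch
#include "foo_riscv.h"

#if __riscv_vector
#include <riscv_vector.h>
#endif // __riscv_vector

namespace ncnn {

#if NCNN_ZFH
int Foo_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = bottom_top_blob.channel(q);

#if __riscv_zvfh
        // zvfh (or xtheadvector compiled with -D__riscv_zvfh=1): strip-mined fp16 vector loop
        int n = size;
        while (n > 0)
        {
            size_t vl = __riscv_vsetvl_e16m8(n);
            vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
            _p = __riscv_vfmul_vf_f16m8(_p, (__fp16)2.f, vl); // placeholder op
            __riscv_vse16_v_f16m8(ptr, _p, vl);
            ptr += vl;
            n -= vl;
        }
#else  // __riscv_zvfh
        // zfh-only toolchain: scalar fallback
        for (int i = 0; i < size; i++)
        {
            ptr[i] = ptr[i] * (__fp16)2.f; // placeholder op
        }
#endif // __riscv_zvfh
    }

    return 0;
}
#endif // NCNN_ZFH

} // namespace ncnn
```

The matching constructor in the base `foo_riscv.cpp` would set `support_fp16_storage` from `cpu_support_riscv_zvfh()` (vector builds) or `cpu_support_riscv_zfh()` (scalar builds), exactly as the absval and batchnorm constructors above do, so the fp16 path only runs on hardware that reports the capability.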
+ +#include "batchnorm_riscv.h" + +#if __riscv_vector +#include +#include "riscv_usability.h" +#endif // __riscv_vector + +namespace ncnn { + +#if NCNN_ZFH +int BatchNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + int dims = bottom_top_blob.dims; + int elempack = bottom_top_blob.elempack; + if (dims == 1) + { + __fp16* ptr = bottom_top_blob; +#if __riscv_zvfh + const float* ptr_a = a_data; + const float* ptr_b = b_data; + int n = bottom_top_blob.w * elempack; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + + vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); + vfloat32m8_t _a = __riscv_vle32_v_f32m8(ptr_a, vl); + vfloat32m8_t _b = __riscv_vle32_v_f32m8(ptr_b, vl); + + _p = __riscv_vfmadd_vv_f32m8(_p, _b, _a, vl); + + __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + ptr_a += vl; + ptr_b += vl; + n -= vl; + } +#else // __riscv_zvfh + int w = bottom_top_blob.w; + for (int i = 0; i < w; i++) + { + ptr[i] = (__fp16)(b_data[i] * (float)ptr[i] + a_data[i]); + } +#endif // __riscv_zvfh + + return 0; + } + + if (elempack == 1) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + if (dims == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + float a = a_data[i]; + float b = b_data[i]; + +#if __riscv_zvfh + int n = w; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); + _p = __riscv_vfmul_vf_f32m8(_p, b, vl); + _p = __riscv_vfadd_vf_f32m8(_p, a, vl); + __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int j = 0; j < w; j++) + { + ptr[j] = (__fp16)(b * (float)ptr[j] + a); + } +#endif // __riscv_zvfh + } + } + if (dims == 3 || dims == 4) + { + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + float a = a_data[q]; + float b = b_data[q]; + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); + _p = __riscv_vfmul_vf_f32m8(_p, b, vl); + _p = __riscv_vfadd_vf_f32m8(_p, a, vl); + __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + ptr[i] = (__fp16)(b * (float)ptr[i] + a); + } +#endif // __riscv_zvfh + } + } + + return 0; + } + +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; // fp16 + if (elempack == packn) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + + const size_t vl = __riscv_vsetvl_e16m1(packn); + if (dims == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + const float* ptr_a = (const float*)a_data + i * elempack; + const float* ptr_b = (const float*)b_data + i * elempack; + int n = w * elempack; + + vfloat32m2_t _a = __riscv_vle32_v_f32m2(ptr_a, vl); + vfloat32m2_t _b = __riscv_vle32_v_f32m2(ptr_b, vl); + while (n > 0) + { + vfloat32m2_t _p = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(ptr, vl), vl); + _p = __riscv_vfmadd_vv_f32m2(_p, _b, _a, vl); + __riscv_vse16_v_f16m1(ptr, 
__riscv_vfncvt_f_f_w_f16m1(_p, vl), vl); + + ptr += vl; + n -= vl; + } + } + } + + if (dims == 3 || dims == 4) + { + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + const float* ptr_a = (const float*)a_data + q * elempack; + const float* ptr_b = (const float*)b_data + q * elempack; + + vfloat32m2_t _a = __riscv_vle32_v_f32m2(ptr_a, vl); + vfloat32m2_t _b = __riscv_vle32_v_f32m2(ptr_b, vl); + + int n = size; + while (n > 0) + { + vfloat32m2_t _p = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(ptr, vl), vl); + _p = __riscv_vfmadd_vv_f32m2(_p, _b, _a, vl); + __riscv_vse16_v_f16m1(ptr, __riscv_vfncvt_f_f_w_f16m1(_p, vl), vl); + + ptr += vl; + n -= vl; + } + } + } + } +#endif // __riscv_zvfh + + return 0; +} + +int BatchNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const +{ + int dims = bottom_top_blob.dims; + int elempack = bottom_top_blob.elempack; + if (dims == 1) + { + __fp16* ptr = bottom_top_blob; +#if __riscv_zvfh + const float* ptr_a = a_data; + const float* ptr_b = b_data; + int n = bottom_top_blob.w * elempack; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + + vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl); + vfloat16m4_t _a = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(ptr_a, vl), vl); + vfloat16m4_t _b = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(ptr_b, vl), vl); + + _p = __riscv_vfmadd_vv_f16m4(_p, _b, _a, vl); + + __riscv_vse16_v_f16m4(ptr, _p, vl); + + ptr += vl; + ptr_a += vl; + ptr_b += vl; + n -= vl; + } +#else // __riscv_zvfh + int w = bottom_top_blob.w; + for (int i = 0; i < w; i++) + { + ptr[i] = (__fp16)b_data[i] * ptr[i] + (__fp16)a_data[i]; + } +#endif // __riscv_zvfh + + return 0; + } + + if (elempack == 1) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + if (dims == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + __fp16 a = (__fp16)a_data[i]; + __fp16 b = (__fp16)b_data[i]; + +#if __riscv_zvfh + int n = w; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + _p = __riscv_vfmul_vf_f16m8(_p, b, vl); + _p = __riscv_vfadd_vf_f16m8(_p, a, vl); + __riscv_vse16_v_f16m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int j = 0; j < w; j++) + { + ptr[j] = b * ptr[j] + a; + } +#endif // __riscv_zvfh + } + } + if (dims == 3 || dims == 4) + { + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + __fp16 a = (__fp16)a_data[q]; + __fp16 b = (__fp16)b_data[q]; + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + _p = __riscv_vfmul_vf_f16m8(_p, b, vl); + _p = __riscv_vfadd_vf_f16m8(_p, a, vl); + __riscv_vse16_v_f16m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + ptr[i] = b * ptr[i] + a; + } +#endif // __riscv_zvfh + } + } + + return 0; + } + +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; // fp16 + if (elempack == packn) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + + const size_t vl = 
__riscv_vsetvl_e16m1(packn); + if (dims == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + const float* ptr_a = (const float*)a_data + i * elempack; + const float* ptr_b = (const float*)b_data + i * elempack; + int n = w * elempack; + + vfloat16m1_t _a = __riscv_vfncvt_f_f_w_f16m1(__riscv_vle32_v_f32m2(ptr_a, vl), vl); + vfloat16m1_t _b = __riscv_vfncvt_f_f_w_f16m1(__riscv_vle32_v_f32m2(ptr_b, vl), vl); + while (n > 0) + { + vfloat16m1_t _p = __riscv_vle16_v_f16m1(ptr, vl); + _p = __riscv_vfmadd_vv_f16m1(_p, _b, _a, vl); + __riscv_vse16_v_f16m1(ptr, _p, vl); + + ptr += vl; + n -= vl; + } + } + } + + if (dims == 3 || dims == 4) + { + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + const float* ptr_a = (const float*)a_data + q * elempack; + const float* ptr_b = (const float*)b_data + q * elempack; + + vfloat16m1_t _a = __riscv_vfncvt_f_f_w_f16m1(__riscv_vle32_v_f32m2(ptr_a, vl), vl); + vfloat16m1_t _b = __riscv_vfncvt_f_f_w_f16m1(__riscv_vle32_v_f32m2(ptr_b, vl), vl); + + int n = size; + while (n > 0) + { + vfloat16m1_t _p = __riscv_vle16_v_f16m1(ptr, vl); + _p = __riscv_vfmadd_vv_f16m1(_p, _b, _a, vl); + __riscv_vse16_v_f16m1(ptr, _p, vl); + + ptr += vl; + n -= vl; + } + } + } + } +#endif // __riscv_zvfh + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/binaryop_riscv.cpp b/src/layer/riscv/binaryop_riscv.cpp index da4593197f4..f7d750614cc 100644 --- a/src/layer/riscv/binaryop_riscv.cpp +++ b/src/layer/riscv/binaryop_riscv.cpp @@ -20,10 +20,10 @@ #if __riscv_vector #include #include "rvv_mathfun.h" -#include "rvv_mathfun_fp16s.h" +#include "riscv_usability.h" #endif // __riscv_vector -#include "riscv_usability.h" +#include "cpu.h" namespace ncnn { @@ -31,8 +31,12 @@ BinaryOp_riscv::BinaryOp_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); #endif #endif } @@ -46,11 +50,11 @@ static void binary_op_vector_no_broadcast(const float* ptr, const float* ptr1, f int n = size; while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vfloat32m8_t _p1 = __riscv_vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_p, _p1, vl); - vse32_v_f32m8(outptr, _outp, vl); + __riscv_vse32_v_f32m8(outptr, _outp, vl); n -= vl; ptr += vl; ptr1 += vl; @@ -76,13 +80,13 @@ static void binary_op_vector_broadcast_b(const float* ptr, const float* ptr1, fl #if __riscv_vector int n = size; - vfloat32m8_t _bx = (elempack == 1) ? vfmv_v_f_f32m8(b, vsetvl_e32m8(n)) : vle32_v_f32m8_f32m1(ptr1); + vfloat32m8_t _bx = (elempack == 1) ? 
__riscv_vfmv_v_f_f32m8(b, __riscv_vsetvl_e32m8(n)) : __riscv_vle32_v_f32m8_f32m1(ptr1); while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _bx, vl); - vse32_v_f32m8(outptr, _outp, vl); + __riscv_vse32_v_f32m8(outptr, _outp, vl); n -= vl; ptr += vl; outptr += vl; @@ -106,13 +110,13 @@ static void binary_op_vector_broadcast_a(const float* ptr, const float* ptr1, fl #if __riscv_vector int n = size; - vfloat32m8_t _ax = (elempack == 1) ? vfmv_v_f_f32m8(a, vsetvl_e32m8(n)) : vle32_v_f32m8_f32m1(ptr); + vfloat32m8_t _ax = (elempack == 1) ? __riscv_vfmv_v_f_f32m8(a, __riscv_vsetvl_e32m8(n)) : __riscv_vle32_v_f32m8_f32m1(ptr); while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr1, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_ax, _p, vl); - vse32_v_f32m8(outptr, _outp, vl); + __riscv_vse32_v_f32m8(outptr, _outp, vl); n -= vl; ptr1 += vl; outptr += vl; @@ -135,13 +139,13 @@ static void binary_op_vector_broadcast_pb(const float* ptr, const float* ptr1, f #if __riscv_vector // if (elempack == packn) { - size_t vl = vsetvl_e32m8(elempack); + size_t vl = __riscv_vsetvl_e32m8(elempack); int i = 0; for (; i < w; i++) { - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, *ptr1, vl); - vse32_v_f32m8(outptr, _outp, vl); + __riscv_vse32_v_f32m8(outptr, _outp, vl); ptr += vl; ptr1 += 1; outptr += vl; @@ -158,13 +162,13 @@ static void binary_op_vector_broadcast_pb_b(const float* ptr, const float* ptr1, #if __riscv_vector int n = w * elempack; - vfloat32m8_t _bx = vfmv_v_f_f32m8(*ptr1, vsetvl_e32m8(n)); + vfloat32m8_t _bx = __riscv_vfmv_v_f_f32m8(*ptr1, __riscv_vsetvl_e32m8(n)); while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _bx, vl); - vse32_v_f32m8(outptr, _outp, vl); + __riscv_vse32_v_f32m8(outptr, _outp, vl); n -= vl; ptr += vl; outptr += vl; @@ -180,12 +184,12 @@ static void binary_op_vector_broadcast_pb_a(const float* ptr, const float* ptr1, #if __riscv_vector // if (elempack == packn) { - size_t vl = vsetvl_e32m8(elempack); - vfloat32m8_t _ax = vle32_v_f32m8_f32m1(ptr); + size_t vl = __riscv_vsetvl_e32m8(elempack); + vfloat32m8_t _ax = __riscv_vle32_v_f32m8_f32m1(ptr); for (int i = 0; i < w; i++) { vfloat32m8_t _outp = op(_ax, *ptr1, vl); - vse32_v_f32m8(outptr, _outp, vl); + __riscv_vse32_v_f32m8(outptr, _outp, vl); ptr1 += 1; outptr += vl; } @@ -281,18 +285,18 @@ namespace BinaryOp_riscv_functor { // clang-format off // *INDENT-OFF* -MAKE_FUNCTION(binary_op_add, x + y, vfadd_vv_f32m8(x, y, vl), vfadd_vf_f32m8(x, y, vl), vfadd_vf_f32m8(y, x, vl)) -MAKE_FUNCTION(binary_op_sub, x - y, vfsub_vv_f32m8(x, y, vl), vfsub_vf_f32m8(x, y, vl), vfrsub_vf_f32m8(y, x, vl)) -MAKE_FUNCTION(binary_op_mul, x * y, vfmul_vv_f32m8(x, y, vl), vfmul_vf_f32m8(x, y, vl), vfmul_vf_f32m8(y, x, vl)) -MAKE_FUNCTION(binary_op_div, x / y, vfdiv_vv_f32m8(x, y, vl), vfdiv_vf_f32m8(x, y, vl), vfrdiv_vf_f32m8(y, x, vl)) -MAKE_FUNCTION(binary_op_max, std::max(x, y), vfmax_vv_f32m8(x, y, vl), vfmax_vf_f32m8(x, y, vl), vfmax_vf_f32m8(y, x, vl)) -MAKE_FUNCTION(binary_op_min, std::min(x, y), vfmin_vv_f32m8(x, y, vl), vfmin_vf_f32m8(x, y, vl), 
vfmin_vf_f32m8(y, x, vl)) -MAKE_FUNCTION(binary_op_pow, (float)pow(x, y), pow_ps(x, y, vl), pow_ps(x, vfmv_v_f_f32m8(y, vl), vl), pow_ps(vfmv_v_f_f32m8(x, vl), y, vl)) -MAKE_FUNCTION(binary_op_rsub, y - x, vfsub_vv_f32m8(y, x, vl), vfrsub_vf_f32m8(x, y, vl), vfsub_vf_f32m8(y, x, vl)) -MAKE_FUNCTION(binary_op_rdiv, y / x, vfdiv_vv_f32m8(y, x, vl), vfrdiv_vf_f32m8(x, y, vl), vfdiv_vf_f32m8(y, x, vl)) -MAKE_FUNCTION(binary_op_rpow, (float)pow(y, x), pow_ps(y, x, vl), pow_ps(vfmv_v_f_f32m8(y, vl), x, vl), pow_ps(y, vfmv_v_f_f32m8(x, vl), vl)) -MAKE_FUNCTION(binary_op_atan2, (float)atan2(x, y), atan2_ps(x, y, vl), atan2_ps(x, vfmv_v_f_f32m8(y, vl), vl), atan2_ps(vfmv_v_f_f32m8(x, vl), y, vl)) -MAKE_FUNCTION(binary_op_ratan2, (float)atan2(y, x), atan2_ps(y, x, vl), atan2_ps(vfmv_v_f_f32m8(y, vl), x, vl), atan2_ps(y, vfmv_v_f_f32m8(x, vl), vl)) +MAKE_FUNCTION(binary_op_add, x + y, __riscv_vfadd_vv_f32m8(x, y, vl), __riscv_vfadd_vf_f32m8(x, y, vl), __riscv_vfadd_vf_f32m8(y, x, vl)) +MAKE_FUNCTION(binary_op_sub, x - y, __riscv_vfsub_vv_f32m8(x, y, vl), __riscv_vfsub_vf_f32m8(x, y, vl), __riscv_vfrsub_vf_f32m8(y, x, vl)) +MAKE_FUNCTION(binary_op_mul, x * y, __riscv_vfmul_vv_f32m8(x, y, vl), __riscv_vfmul_vf_f32m8(x, y, vl), __riscv_vfmul_vf_f32m8(y, x, vl)) +MAKE_FUNCTION(binary_op_div, x / y, __riscv_vfdiv_vv_f32m8(x, y, vl), __riscv_vfdiv_vf_f32m8(x, y, vl), __riscv_vfrdiv_vf_f32m8(y, x, vl)) +MAKE_FUNCTION(binary_op_max, std::max(x, y), __riscv_vfmax_vv_f32m8(x, y, vl), __riscv_vfmax_vf_f32m8(x, y, vl), __riscv_vfmax_vf_f32m8(y, x, vl)) +MAKE_FUNCTION(binary_op_min, std::min(x, y), __riscv_vfmin_vv_f32m8(x, y, vl), __riscv_vfmin_vf_f32m8(x, y, vl), __riscv_vfmin_vf_f32m8(y, x, vl)) +MAKE_FUNCTION(binary_op_pow, (float)pow(x, y), pow_ps(x, y, vl), pow_ps(x, __riscv_vfmv_v_f_f32m8(y, vl), vl), pow_ps(__riscv_vfmv_v_f_f32m8(x, vl), y, vl)) +MAKE_FUNCTION(binary_op_rsub, y - x, __riscv_vfsub_vv_f32m8(y, x, vl), __riscv_vfrsub_vf_f32m8(x, y, vl), __riscv_vfsub_vf_f32m8(y, x, vl)) +MAKE_FUNCTION(binary_op_rdiv, y / x, __riscv_vfdiv_vv_f32m8(y, x, vl), __riscv_vfrdiv_vf_f32m8(x, y, vl), __riscv_vfdiv_vf_f32m8(y, x, vl)) +MAKE_FUNCTION(binary_op_rpow, (float)pow(y, x), pow_ps(y, x, vl), pow_ps(__riscv_vfmv_v_f_f32m8(y, vl), x, vl), pow_ps(y, __riscv_vfmv_v_f_f32m8(x, vl), vl)) +MAKE_FUNCTION(binary_op_atan2, (float)atan2(x, y), atan2_ps(x, y, vl), atan2_ps(x, __riscv_vfmv_v_f_f32m8(y, vl), vl), atan2_ps(__riscv_vfmv_v_f_f32m8(x, vl), y, vl)) +MAKE_FUNCTION(binary_op_ratan2, (float)atan2(y, x), atan2_ps(y, x, vl), atan2_ps(__riscv_vfmv_v_f_f32m8(y, vl), x, vl), atan2_ps(y, __riscv_vfmv_v_f_f32m8(x, vl), vl)) // *INDENT-ON* // clang-format on @@ -469,9 +473,9 @@ static int get_reverse_op_type(int op_type) int BinaryOp_riscv::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { +#if NCNN_ZFH int elembits = std::max(bottom_blobs[0].elembits(), bottom_blobs[1].elembits()); -#if __riscv_vector && __riscv_zfh if (opt.use_fp16_storage && elembits == 16) { return forward_fp16s(bottom_blobs, top_blobs, opt); @@ -627,9 +631,9 @@ int BinaryOp_riscv::forward(const std::vector& bottom_blobs, std::vector -static void binary_op_vector_no_broadcast_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int size) -{ - const Op op; - -#if __riscv_vector - int n = size; - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); - vfloat16m8_t _outp = op(_p, _p1, vl); - vse16_v_f16m8(outptr, _outp, 
vl); - n -= vl; - ptr += vl; - ptr1 += vl; - outptr += vl; - } -#else - for (int i = 0; i < size; i++) - { - *outptr = op(*ptr, *ptr1); - ptr += 1; - ptr1 += 1; - outptr += 1; - } -#endif -} - -template -static void binary_op_vector_broadcast_b_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int size, int elempack) -{ - const Op op; - - const __fp16 b = *ptr1; - -#if __riscv_vector - int n = size; - vfloat16m8_t _bx = (elempack == 1) ? vfmv_v_f_f16m8(b, vsetvl_e16m8(n)) : vle16_v_f16m8_f16m1(ptr1); - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - vfloat16m8_t _outp = op(_p, _bx, vl); - vse16_v_f16m8(outptr, _outp, vl); - n -= vl; - ptr += vl; - outptr += vl; - } -#else - for (int i = 0; i < size; i++) - { - *outptr = op(*ptr, b); - ptr += 1; - outptr += 1; - } -#endif -} - -template -static void binary_op_vector_broadcast_a_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int size, int elempack) -{ - const Op op; - - const __fp16 a = *ptr; - -#if __riscv_vector - int n = size; - vfloat16m8_t _ax = (elempack == 1) ? vfmv_v_f_f16m8(a, vsetvl_e16m8(n)) : vle16_v_f16m8_f16m1(ptr); - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - vfloat16m8_t _p = vle16_v_f16m8(ptr1, vl); - vfloat16m8_t _outp = op(_ax, _p, vl); - vse16_v_f16m8(outptr, _outp, vl); - n -= vl; - ptr1 += vl; - outptr += vl; - } -#else - for (int i = 0; i < size; i++) - { - *outptr = op(a, *ptr1); - ptr1 += 1; - outptr += 1; - } -#endif -} - -template -static void binary_op_vector_broadcast_pb_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int w, int elempack) -{ - const Op op; - -#if __riscv_vector - // if (elempack == packn) - { - size_t vl = vsetvl_e16m8(elempack); - int i = 0; - for (; i < w; i++) - { - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - vfloat16m8_t _outp = op(_p, *ptr1, vl); - vse16_v_f16m8(outptr, _outp, vl); - ptr += vl; - ptr1 += 1; - outptr += vl; - } - } -#endif // __riscv_vector -} - -template -static void binary_op_vector_broadcast_pb_b_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int w, int elempack) -{ - const Op op; - -#if __riscv_vector - int n = w * elempack; - - vfloat16m8_t _bx = vfmv_v_f_f16m8(*ptr1, vsetvl_e16m8(n)); - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - vfloat16m8_t _outp = op(_p, _bx, vl); - vse16_v_f16m8(outptr, _outp, vl); - n -= vl; - ptr += vl; - outptr += vl; - } -#endif // __riscv_vector -} - -template -static void binary_op_vector_broadcast_pb_a_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int w, int elempack) -{ - const Op op; - -#if __riscv_vector - // if (elempack == packn) - { - size_t vl = vsetvl_e16m8(elempack); - vfloat16m8_t _ax = vle16_v_f16m8_f16m1(ptr); - for (int i = 0; i < w; i++) - { - vfloat16m8_t _outp = op(_ax, *ptr1, vl); - vse16_v_f16m8(outptr, _outp, vl); - ptr1 += 1; - outptr += vl; - } - } -#endif // __riscv_vector -} - -template -static void binary_op_vector_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int aw, int bw, int ap, int bp) -{ - const int w = std::max(aw, bw); - const int elempack = std::max(ap, bp); - const int size = w * elempack; - - if (ap == bp) - { - if (aw == bw) - { - // no broadcast - return binary_op_vector_no_broadcast_fp16s(ptr, ptr1, outptr, size); - } - - if (bw == 1) - { - // broadcast single b - return binary_op_vector_broadcast_b_fp16s(ptr, ptr1, outptr, size, elempack); - } - - if (aw == 1) - { - // broadcast single a - return 
binary_op_vector_broadcast_a_fp16s(ptr, ptr1, outptr, size, elempack); - } - } - - if (bp == 1) - { - if (aw == bw) - { - // broadcast pack1 b - return binary_op_vector_broadcast_pb_fp16s(ptr, ptr1, outptr, w, elempack); - } - - if (bw == 1) - { - // broadcast pack1 single b - return binary_op_vector_broadcast_pb_b_fp16s(ptr, ptr1, outptr, w, elempack); - } - - if (aw == 1) - { - // broadcast single a and pack1 b - return binary_op_vector_broadcast_pb_a_fp16s(ptr, ptr1, outptr, w, elempack); - } - } - - // shall never reach here -} - -namespace BinaryOp_riscv_functor { - -#define MAKE_FUNCTION(NAME, IMPL, IMPLVV, IMPLVS, IMPLSV) \ - struct NAME \ - { \ - __fp16 operator()(const __fp16& x, const __fp16& y) const \ - { \ - return IMPL; \ - } \ - vfloat16m8_t operator()(const vfloat16m8_t& x, const vfloat16m8_t& y, const size_t vl) const \ - { \ - return IMPLVV; \ - } \ - vfloat16m8_t operator()(const vfloat16m8_t& x, const __fp16& y, const size_t vl) const \ - { \ - return IMPLVS; \ - } \ - vfloat16m8_t operator()(const __fp16& x, const vfloat16m8_t& y, const size_t vl) const \ - { \ - return IMPLSV; \ - } \ - }; - -// clang-format off -// *INDENT-OFF* -MAKE_FUNCTION(binary_op_add_fp16s, x + y, vfadd_vv_f16m8(x, y, vl), vfadd_vf_f16m8(x, y, vl), vfadd_vf_f16m8(y, x, vl)) -MAKE_FUNCTION(binary_op_sub_fp16s, x - y, vfsub_vv_f16m8(x, y, vl), vfsub_vf_f16m8(x, y, vl), vfrsub_vf_f16m8(y, x, vl)) -MAKE_FUNCTION(binary_op_mul_fp16s, x * y, vfmul_vv_f16m8(x, y, vl), vfmul_vf_f16m8(x, y, vl), vfmul_vf_f16m8(y, x, vl)) -MAKE_FUNCTION(binary_op_div_fp16s, x / y, vfdiv_vv_f16m8(x, y, vl), vfdiv_vf_f16m8(x, y, vl), vfrdiv_vf_f16m8(y, x, vl)) -MAKE_FUNCTION(binary_op_max_fp16s, std::max(x, y), vfmax_vv_f16m8(x, y, vl), vfmax_vf_f16m8(x, y, vl), vfmax_vf_f16m8(y, x, vl)) -MAKE_FUNCTION(binary_op_min_fp16s, std::min(x, y), vfmin_vv_f16m8(x, y, vl), vfmin_vf_f16m8(x, y, vl), vfmin_vf_f16m8(y, x, vl)) -MAKE_FUNCTION(binary_op_pow_fp16s, (__fp16)pow((float)x, (float)y), pow_ps(x, y, vl), pow_ps(x, vfmv_v_f_f16m8(y, vl), vl), pow_ps(vfmv_v_f_f16m8(x, vl), y, vl)) -MAKE_FUNCTION(binary_op_rsub_fp16s, y - x, vfsub_vv_f16m8(y, x, vl), vfrsub_vf_f16m8(x, y, vl), vfsub_vf_f16m8(y, x, vl)) -MAKE_FUNCTION(binary_op_rdiv_fp16s, y / x, vfdiv_vv_f16m8(y, x, vl), vfrdiv_vf_f16m8(x, y, vl), vfdiv_vf_f16m8(y, x, vl)) -MAKE_FUNCTION(binary_op_rpow_fp16s, (__fp16)pow((float)y, (float)x), pow_ps(y, x, vl), pow_ps(vfmv_v_f_f16m8(y, vl), x, vl), pow_ps(y, vfmv_v_f_f16m8(x, vl), vl)) -MAKE_FUNCTION(binary_op_atan2_fp16s, (__fp16)atan2((float)x, (float)y), atan2_ps(x, y, vl), atan2_ps(x, vfmv_v_f_f16m8(y, vl), vl), atan2_ps(vfmv_v_f_f16m8(x, vl), y, vl)) -MAKE_FUNCTION(binary_op_ratan2_fp16s, (__fp16)atan2((float)y, (float)x), atan2_ps(y, x, vl), atan2_ps(vfmv_v_f_f16m8(y, vl), x, vl), atan2_ps(y, vfmv_v_f_f16m8(x, vl), vl)) -// *INDENT-ON* -// clang-format on - -#undef MAKE_FUNCTION - -} // namespace BinaryOp_riscv_functor - -static void binary_op_vector_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int aw, int bw, int ap, int bp, int op_type) -{ - using namespace BinaryOp_riscv_functor; - - if (op_type == BinaryOp::Operation_ADD) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); - if (op_type == BinaryOp::Operation_SUB) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); - if (op_type == BinaryOp::Operation_MUL) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); - if (op_type == BinaryOp::Operation_DIV) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); - 
if (op_type == BinaryOp::Operation_MAX) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); - if (op_type == BinaryOp::Operation_MIN) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); - if (op_type == BinaryOp::Operation_POW) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); - if (op_type == BinaryOp::Operation_RSUB) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); - if (op_type == BinaryOp::Operation_RDIV) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); - if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); - if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); - if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); - - // should never reach here -} - -static void binary_op_scalar_fp16s(const Mat& a, __fp16 b, Mat& c, int op_type, const Option& opt) -{ - const int channels = a.c; - const int size = a.w * a.h * a.d * a.elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const __fp16* ptr = a.channel(q); - __fp16* outptr = c.channel(q); - - binary_op_vector_fp16s(ptr, &b, outptr, size, 1, 1, 1, op_type); - } -} - -static void binary_op_no_broadcast_fp16s(const Mat& a, const Mat& b, Mat& c, int op_type, const Option& opt) -{ - const int channels = a.c; - const int size = a.w * a.h * a.d * a.elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const __fp16* ptr = a.channel(q); - const __fp16* ptr1 = b.channel(q); - __fp16* outptr = c.channel(q); - - binary_op_vector_fp16s(ptr, ptr1, outptr, size, size, 1, 1, op_type); - } -} - -static void binary_op_broadcast_fp16s(const Mat& a, const Mat& b, Mat& c, int op_type, const Option& opt) -{ - if (b.w * b.h * b.d * b.c * b.elempack == 1) - { - return binary_op_scalar_fp16s(a, ((const __fp16*)b)[0], c, op_type, opt); - } - - if (a.dims == b.dims && a.w == b.w && a.h == b.h && a.d == b.d && a.c == b.c && a.elempack == b.elempack) - { - return binary_op_no_broadcast_fp16s(a, b, c, op_type, opt); - } - - const int dims = c.dims; - - if (dims == 2) - { - const int h = c.h; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const int y0 = std::min(y, a.h - 1); - const int y1 = std::min(y, b.h - 1); - - const __fp16* ptr = a.row(y0); - const __fp16* ptr1 = b.row(y1); - __fp16* outptr = c.row<__fp16>(y); - - binary_op_vector_fp16s(ptr, ptr1, outptr, a.w, b.w, a.elempack, b.elempack, op_type); - } - } - - if (dims == 3 || dims == 4) - { - const int channels = c.c; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const int q0 = std::min(q, a.c - 1); - const int q1 = std::min(q, b.c - 1); - - if (b.d * b.h * b.w == 1) - { - const __fp16* ptr = a.channel(q0); - const __fp16* ptr1 = b.channel(q1); - __fp16* outptr = c.channel(q); - - binary_op_vector_fp16s(ptr, ptr1, outptr, a.w * a.h * a.d, 1, a.elempack, b.elempack, op_type); - continue; - } - - if (b.h * b.w == 1) - { - for (int z = 0; z < c.d; z++) - { - const int z0 = std::min(z, a.d - 1); - const int z1 = std::min(z, b.d - 1); - - const __fp16* ptr = a.channel(q0).depth(z0); - const __fp16* ptr1 = b.channel(q1).depth(z1); - __fp16* outptr = c.channel(q).depth(z); - - binary_op_vector_fp16s(ptr, ptr1, outptr, a.w * a.h, 1, a.elempack, b.elempack, op_type); - 
} - continue; - } - - for (int z = 0; z < c.d; z++) - { - const int z0 = std::min(z, a.d - 1); - const int z1 = std::min(z, b.d - 1); - - for (int y = 0; y < c.h; y++) - { - const int y0 = std::min(y, a.h - 1); - const int y1 = std::min(y, b.h - 1); - - const __fp16* ptr = a.channel(q0).depth(z0).row(y0); - const __fp16* ptr1 = b.channel(q1).depth(z1).row(y1); - __fp16* outptr = c.channel(q).depth(z).row<__fp16>(y); - - binary_op_vector_fp16s(ptr, ptr1, outptr, a.w, b.w, a.elempack, b.elempack, op_type); - } - } - } - } -} - -static void binary_op_scalar_inplace_fp16s(Mat& a, __fp16 b, int op_type, const Option& opt) -{ - const int channels = a.c; - const int size = a.w * a.h * a.d * a.elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = a.channel(q); - - binary_op_vector_fp16s(ptr, &b, ptr, size, 1, 1, 1, op_type); - } -} - -int BinaryOp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const -{ - const Mat& A = bottom_blobs[0]; - const Mat& B = bottom_blobs[1]; - const int outdims = std::max(A.dims, B.dims); - - Mat A2 = A; - Mat B2 = B; - if (A.dims < outdims) - { - // expand inner axes - if (outdims == 2) - { - if (A.w * A.elempack == B.h * B.elempack) - A2 = A.reshape(1, A.w, opt.workspace_allocator); - else // if (A.w == B.w) - { - A2.dims = 2; - A2.w = A.w * A.elempack; - A2.elempack = 1; - A2.elemsize = A.elemsize / A.elempack; - A2.cstep = A2.w; - } - } - if (outdims == 3 && A.dims == 1) - { - if (A.w * A.elempack == B.c * B.elempack) - A2 = A.reshape(1, 1, A.w, opt.workspace_allocator); - else // if (A.w == B.w) - { - A2.dims = 3; - A2.w = A.w * A.elempack; - A2.elempack = 1; - A2.elemsize = A.elemsize / A.elempack; - A2.cstep = A2.w; - } - } - if (outdims == 3 && A.dims == 2) - A2 = A.reshape(1, A.w, A.h, opt.workspace_allocator); - if (outdims == 4 && A.dims == 1) - { - if (A.w * A.elempack == B.c * B.elempack) - A2 = A.reshape(1, 1, 1, A.w, opt.workspace_allocator); - else // if (A.w == B.w) - { - A2.dims = 4; - A2.w = A.w * A.elempack; - A2.elempack = 1; - A2.elemsize = A.elemsize / A.elempack; - A2.cstep = A2.w; - } - } - if (outdims == 4 && A.dims == 2) - A2 = A.reshape(1, 1, A.w, A.h, opt.workspace_allocator); - if (outdims == 4 && A.dims == 3) - A2 = A.reshape(1, A.w, A.h, A.c, opt.workspace_allocator); - } - if (B.dims < outdims) - { - // expand inner axes - if (outdims == 2) - { - if (B.w * B.elempack == A.h * A.elempack) - B2 = B.reshape(1, B.w, opt.workspace_allocator); - else // if (B.w == A.w) - { - B2.dims = 2; - B2.w = B.w * B.elempack; - B2.elempack = 1; - B2.elemsize = B.elemsize / B.elempack; - B2.cstep = B2.w; - } - } - if (outdims == 3 && B.dims == 1) - { - if (B.w * B.elempack == A.c * A.elempack) - B2 = B.reshape(1, 1, B.w, opt.workspace_allocator); - else // if (B.w == A.w) - { - B2.dims = 3; - B2.w = B.w * B.elempack; - B2.elempack = 1; - B2.elemsize = B.elemsize / B.elempack; - B2.cstep = B2.w; - } - } - if (outdims == 3 && B.dims == 2) - B2 = B.reshape(1, B.w, B.h, opt.workspace_allocator); - if (outdims == 4 && B.dims == 1) - { - if (B.w * B.elempack == A.c * A.elempack) - B2 = B.reshape(1, 1, 1, B.w, opt.workspace_allocator); - else // if (B.w == A.w) - { - B2.dims = 4; - B2.w = B.w * B.elempack; - B2.elempack = 1; - B2.elemsize = B.elemsize / B.elempack; - B2.cstep = B2.w; - } - } - if (outdims == 4 && B.dims == 2) - B2 = B.reshape(1, 1, B.w, B.h, opt.workspace_allocator); - if (outdims == 4 && B.dims == 3) - B2 = B.reshape(1, B.w, 
B.h, B.c, opt.workspace_allocator); - } - - const int outw = std::max(A2.w, B2.w); - const int outh = std::max(A2.h, B2.h); - const int outd = std::max(A2.d, B2.d); - const int outc = std::max(A2.c, B2.c); - const size_t out_elemsize = std::max(A2.elemsize, B2.elemsize); - const int out_elempack = std::max(A2.elempack, B2.elempack); - - Mat& top_blob = top_blobs[0]; - if (outdims == 1) - { - top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator); - } - if (outdims == 2) - { - top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator); - } - if (outdims == 3) - { - top_blob.create(outw, outh, outc, out_elemsize, out_elempack, opt.blob_allocator); - } - if (outdims == 4) - { - top_blob.create(outw, outh, outd, outc, out_elemsize, out_elempack, opt.blob_allocator); - } - if (top_blob.empty()) - return -100; - - const bool a_pack_is_lower = A2.elempack < B2.elempack; - const bool a_pack_is_equal = A2.elempack == B2.elempack; - const bool a_size_is_lower = A2.w * A2.h * A2.d * A2.c * A2.elempack < B2.w * B2.h * B2.d * B2.c * B2.elempack; - if (a_pack_is_lower || (a_pack_is_equal && a_size_is_lower)) - { - binary_op_broadcast_fp16s(B2, A2, top_blob, get_reverse_op_type(op_type), opt); - } - else - { - binary_op_broadcast_fp16s(A2, B2, top_blob, op_type, opt); - } - - return 0; -} - -int BinaryOp_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const -{ - binary_op_scalar_inplace_fp16s(bottom_top_blob, b, op_type, opt); - - return 0; -} -#endif // __riscv_vector && __riscv_zfh - } // namespace ncnn diff --git a/src/layer/riscv/binaryop_riscv.h b/src/layer/riscv/binaryop_riscv.h index afc728b6e68..9a2d473c51f 100644 --- a/src/layer/riscv/binaryop_riscv.h +++ b/src/layer/riscv/binaryop_riscv.h @@ -31,7 +31,7 @@ class BinaryOp_riscv : public BinaryOp virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/binaryop_riscv_zfh.cpp b/src/layer/riscv/binaryop_riscv_zfh.cpp new file mode 100644 index 00000000000..556a666d5a9 --- /dev/null +++ b/src/layer/riscv/binaryop_riscv_zfh.cpp @@ -0,0 +1,616 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "binaryop_riscv.h" + +#if __riscv_vector +#include +#include "rvv_mathfun.h" +#include "riscv_usability.h" +#if __riscv_zvfh +#include "rvv_mathfun_fp16s.h" +#endif +#endif // __riscv_vector + +namespace ncnn { + +#if NCNN_ZFH +template +static void binary_op_vector_no_broadcast_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int size) +{ + const Op op; + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + vfloat16m8_t _p1 = __riscv_vle16_v_f16m8(ptr1, vl); + vfloat16m8_t _outp = op(_p, _p1, vl); + __riscv_vse16_v_f16m8(outptr, _outp, vl); + n -= vl; + ptr += vl; + ptr1 += vl; + outptr += vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *outptr = op(*ptr, *ptr1); + ptr += 1; + ptr1 += 1; + outptr += 1; + } +#endif // __riscv_zvfh +} + +template +static void binary_op_vector_broadcast_b_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int size, int elempack) +{ + const Op op; + + const __fp16 b = *ptr1; + +#if __riscv_zvfh + int n = size; + vfloat16m8_t _bx = (elempack == 1) ? __riscv_vfmv_v_f_f16m8(b, __riscv_vsetvl_e16m8(n)) : __riscv_vle16_v_f16m8_f16m1(ptr1); + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + vfloat16m8_t _outp = op(_p, _bx, vl); + __riscv_vse16_v_f16m8(outptr, _outp, vl); + n -= vl; + ptr += vl; + outptr += vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *outptr = op(*ptr, b); + ptr += 1; + outptr += 1; + } +#endif // __riscv_zvfh +} + +template +static void binary_op_vector_broadcast_a_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int size, int elempack) +{ + const Op op; + + const __fp16 a = *ptr; + +#if __riscv_zvfh + int n = size; + vfloat16m8_t _ax = (elempack == 1) ? 
__riscv_vfmv_v_f_f16m8(a, __riscv_vsetvl_e16m8(n)) : __riscv_vle16_v_f16m8_f16m1(ptr); + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr1, vl); + vfloat16m8_t _outp = op(_ax, _p, vl); + __riscv_vse16_v_f16m8(outptr, _outp, vl); + n -= vl; + ptr1 += vl; + outptr += vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *outptr = op(a, *ptr1); + ptr1 += 1; + outptr += 1; + } +#endif // __riscv_zvfh +} + +template +static void binary_op_vector_broadcast_pb_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int w, int elempack) +{ + const Op op; + +#if __riscv_zvfh + // if (elempack == packn) + { + size_t vl = __riscv_vsetvl_e16m8(elempack); + int i = 0; + for (; i < w; i++) + { + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + vfloat16m8_t _outp = op(_p, *ptr1, vl); + __riscv_vse16_v_f16m8(outptr, _outp, vl); + ptr += vl; + ptr1 += 1; + outptr += vl; + } + } +#endif // __riscv_zvfh +} + +template +static void binary_op_vector_broadcast_pb_b_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int w, int elempack) +{ + const Op op; + +#if __riscv_zvfh + int n = w * elempack; + + vfloat16m8_t _bx = __riscv_vfmv_v_f_f16m8(*ptr1, __riscv_vsetvl_e16m8(n)); + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + vfloat16m8_t _outp = op(_p, _bx, vl); + __riscv_vse16_v_f16m8(outptr, _outp, vl); + n -= vl; + ptr += vl; + outptr += vl; + } +#endif // __riscv_zvfh +} + +template +static void binary_op_vector_broadcast_pb_a_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int w, int elempack) +{ + const Op op; + +#if __riscv_zvfh + // if (elempack == packn) + { + size_t vl = __riscv_vsetvl_e16m8(elempack); + vfloat16m8_t _ax = __riscv_vle16_v_f16m8_f16m1(ptr); + for (int i = 0; i < w; i++) + { + vfloat16m8_t _outp = op(_ax, *ptr1, vl); + __riscv_vse16_v_f16m8(outptr, _outp, vl); + ptr1 += 1; + outptr += vl; + } + } +#endif // __riscv_zvfh +} + +template +static void binary_op_vector_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int aw, int bw, int ap, int bp) +{ + const int w = std::max(aw, bw); + const int elempack = std::max(ap, bp); + const int size = w * elempack; + + if (ap == bp) + { + if (aw == bw) + { + // no broadcast + return binary_op_vector_no_broadcast_fp16s(ptr, ptr1, outptr, size); + } + + if (bw == 1) + { + // broadcast single b + return binary_op_vector_broadcast_b_fp16s(ptr, ptr1, outptr, size, elempack); + } + + if (aw == 1) + { + // broadcast single a + return binary_op_vector_broadcast_a_fp16s(ptr, ptr1, outptr, size, elempack); + } + } + + if (bp == 1) + { + if (aw == bw) + { + // broadcast pack1 b + return binary_op_vector_broadcast_pb_fp16s(ptr, ptr1, outptr, w, elempack); + } + + if (bw == 1) + { + // broadcast pack1 single b + return binary_op_vector_broadcast_pb_b_fp16s(ptr, ptr1, outptr, w, elempack); + } + + if (aw == 1) + { + // broadcast single a and pack1 b + return binary_op_vector_broadcast_pb_a_fp16s(ptr, ptr1, outptr, w, elempack); + } + } + + // shall never reach here +} + +namespace BinaryOp_riscv_functor { + +#if __riscv_zvfh +#define MAKE_FUNCTION(NAME, IMPL, IMPLVV, IMPLVS, IMPLSV) \ + struct NAME \ + { \ + __fp16 operator()(const __fp16& x, const __fp16& y) const \ + { \ + return IMPL; \ + } \ + vfloat16m8_t operator()(const vfloat16m8_t& x, const vfloat16m8_t& y, const size_t vl) const \ + { \ + return IMPLVV; \ + } \ + vfloat16m8_t operator()(const vfloat16m8_t& x, const __fp16& y, const 
size_t vl) const \ + { \ + return IMPLVS; \ + } \ + vfloat16m8_t operator()(const __fp16& x, const vfloat16m8_t& y, const size_t vl) const \ + { \ + return IMPLSV; \ + } \ + }; +#else +#define MAKE_FUNCTION(NAME, IMPL, IMPLVV, IMPLVS, IMPLSV) \ + struct NAME \ + { \ + __fp16 operator()(const __fp16& x, const __fp16& y) const \ + { \ + return IMPL; \ + } \ + }; +#endif + +// clang-format off +// *INDENT-OFF* +MAKE_FUNCTION(binary_op_add_fp16s, x + y, __riscv_vfadd_vv_f16m8(x, y, vl), __riscv_vfadd_vf_f16m8(x, y, vl), __riscv_vfadd_vf_f16m8(y, x, vl)) +MAKE_FUNCTION(binary_op_sub_fp16s, x - y, __riscv_vfsub_vv_f16m8(x, y, vl), __riscv_vfsub_vf_f16m8(x, y, vl), __riscv_vfrsub_vf_f16m8(y, x, vl)) +MAKE_FUNCTION(binary_op_mul_fp16s, x * y, __riscv_vfmul_vv_f16m8(x, y, vl), __riscv_vfmul_vf_f16m8(x, y, vl), __riscv_vfmul_vf_f16m8(y, x, vl)) +MAKE_FUNCTION(binary_op_div_fp16s, x / y, __riscv_vfdiv_vv_f16m8(x, y, vl), __riscv_vfdiv_vf_f16m8(x, y, vl), __riscv_vfrdiv_vf_f16m8(y, x, vl)) +MAKE_FUNCTION(binary_op_max_fp16s, std::max(x, y), __riscv_vfmax_vv_f16m8(x, y, vl), __riscv_vfmax_vf_f16m8(x, y, vl), __riscv_vfmax_vf_f16m8(y, x, vl)) +MAKE_FUNCTION(binary_op_min_fp16s, std::min(x, y), __riscv_vfmin_vv_f16m8(x, y, vl), __riscv_vfmin_vf_f16m8(x, y, vl), __riscv_vfmin_vf_f16m8(y, x, vl)) +MAKE_FUNCTION(binary_op_pow_fp16s, (__fp16)pow((float)x, (float)y), pow_ps(x, y, vl), pow_ps(x, __riscv_vfmv_v_f_f16m8(y, vl), vl), pow_ps(__riscv_vfmv_v_f_f16m8(x, vl), y, vl)) +MAKE_FUNCTION(binary_op_rsub_fp16s, y - x, __riscv_vfsub_vv_f16m8(y, x, vl), __riscv_vfrsub_vf_f16m8(x, y, vl), __riscv_vfsub_vf_f16m8(y, x, vl)) +MAKE_FUNCTION(binary_op_rdiv_fp16s, y / x, __riscv_vfdiv_vv_f16m8(y, x, vl), __riscv_vfrdiv_vf_f16m8(x, y, vl), __riscv_vfdiv_vf_f16m8(y, x, vl)) +MAKE_FUNCTION(binary_op_rpow_fp16s, (__fp16)pow((float)y, (float)x), pow_ps(y, x, vl), pow_ps(__riscv_vfmv_v_f_f16m8(y, vl), x, vl), pow_ps(y, __riscv_vfmv_v_f_f16m8(x, vl), vl)) +MAKE_FUNCTION(binary_op_atan2_fp16s, (__fp16)atan2((float)x, (float)y), atan2_ps(x, y, vl), atan2_ps(x, __riscv_vfmv_v_f_f16m8(y, vl), vl), atan2_ps(__riscv_vfmv_v_f_f16m8(x, vl), y, vl)) +MAKE_FUNCTION(binary_op_ratan2_fp16s, (__fp16)atan2((float)y, (float)x), atan2_ps(y, x, vl), atan2_ps(__riscv_vfmv_v_f_f16m8(y, vl), x, vl), atan2_ps(y, __riscv_vfmv_v_f_f16m8(x, vl), vl)) +// *INDENT-ON* +// clang-format on + +#undef MAKE_FUNCTION + +} // namespace BinaryOp_riscv_functor + +static void binary_op_vector_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int aw, int bw, int ap, int bp, int op_type) +{ + using namespace BinaryOp_riscv_functor; + + if (op_type == BinaryOp::Operation_ADD) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); + if (op_type == BinaryOp::Operation_SUB) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); + if (op_type == BinaryOp::Operation_MUL) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); + if (op_type == BinaryOp::Operation_DIV) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); + if (op_type == BinaryOp::Operation_MAX) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); + if (op_type == BinaryOp::Operation_MIN) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); + if (op_type == BinaryOp::Operation_POW) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); + if (op_type == BinaryOp::Operation_RSUB) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); + if (op_type == BinaryOp::Operation_RDIV) return 
binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); + if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); + if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); + if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector_fp16s(ptr, ptr1, outptr, aw, bw, ap, bp); + + // should never reach here +} + +static void binary_op_scalar_fp16s(const Mat& a, __fp16 b, Mat& c, int op_type, const Option& opt) +{ + const int channels = a.c; + const int size = a.w * a.h * a.d * a.elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = a.channel(q); + __fp16* outptr = c.channel(q); + + binary_op_vector_fp16s(ptr, &b, outptr, size, 1, 1, 1, op_type); + } +} + +static void binary_op_no_broadcast_fp16s(const Mat& a, const Mat& b, Mat& c, int op_type, const Option& opt) +{ + const int channels = a.c; + const int size = a.w * a.h * a.d * a.elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = a.channel(q); + const __fp16* ptr1 = b.channel(q); + __fp16* outptr = c.channel(q); + + binary_op_vector_fp16s(ptr, ptr1, outptr, size, size, 1, 1, op_type); + } +} + +static void binary_op_broadcast_fp16s(const Mat& a, const Mat& b, Mat& c, int op_type, const Option& opt) +{ + if (b.w * b.h * b.d * b.c * b.elempack == 1) + { + return binary_op_scalar_fp16s(a, ((const __fp16*)b)[0], c, op_type, opt); + } + + if (a.dims == b.dims && a.w == b.w && a.h == b.h && a.d == b.d && a.c == b.c && a.elempack == b.elempack) + { + return binary_op_no_broadcast_fp16s(a, b, c, op_type, opt); + } + + const int dims = c.dims; + + if (dims == 2) + { + const int h = c.h; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const int y0 = std::min(y, a.h - 1); + const int y1 = std::min(y, b.h - 1); + + const __fp16* ptr = a.row(y0); + const __fp16* ptr1 = b.row(y1); + __fp16* outptr = c.row<__fp16>(y); + + binary_op_vector_fp16s(ptr, ptr1, outptr, a.w, b.w, a.elempack, b.elempack, op_type); + } + } + + if (dims == 3 || dims == 4) + { + const int channels = c.c; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int q0 = std::min(q, a.c - 1); + const int q1 = std::min(q, b.c - 1); + + if (b.d * b.h * b.w == 1) + { + const __fp16* ptr = a.channel(q0); + const __fp16* ptr1 = b.channel(q1); + __fp16* outptr = c.channel(q); + + binary_op_vector_fp16s(ptr, ptr1, outptr, a.w * a.h * a.d, 1, a.elempack, b.elempack, op_type); + continue; + } + + if (b.h * b.w == 1) + { + for (int z = 0; z < c.d; z++) + { + const int z0 = std::min(z, a.d - 1); + const int z1 = std::min(z, b.d - 1); + + const __fp16* ptr = a.channel(q0).depth(z0); + const __fp16* ptr1 = b.channel(q1).depth(z1); + __fp16* outptr = c.channel(q).depth(z); + + binary_op_vector_fp16s(ptr, ptr1, outptr, a.w * a.h, 1, a.elempack, b.elempack, op_type); + } + continue; + } + + for (int z = 0; z < c.d; z++) + { + const int z0 = std::min(z, a.d - 1); + const int z1 = std::min(z, b.d - 1); + + for (int y = 0; y < c.h; y++) + { + const int y0 = std::min(y, a.h - 1); + const int y1 = std::min(y, b.h - 1); + + const __fp16* ptr = a.channel(q0).depth(z0).row(y0); + const __fp16* ptr1 = b.channel(q1).depth(z1).row(y1); + __fp16* outptr = c.channel(q).depth(z).row<__fp16>(y); + + binary_op_vector_fp16s(ptr, ptr1, outptr, a.w, b.w, 
a.elempack, b.elempack, op_type); + } + } + } + } +} + +static void binary_op_scalar_inplace_fp16s(Mat& a, __fp16 b, int op_type, const Option& opt) +{ + const int channels = a.c; + const int size = a.w * a.h * a.d * a.elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = a.channel(q); + + binary_op_vector_fp16s(ptr, &b, ptr, size, 1, 1, 1, op_type); + } +} + +static int get_reverse_op_type(int op_type) +{ + if (op_type == BinaryOp::Operation_SUB) return BinaryOp::Operation_RSUB; + if (op_type == BinaryOp::Operation_DIV) return BinaryOp::Operation_RDIV; + if (op_type == BinaryOp::Operation_POW) return BinaryOp::Operation_RPOW; + if (op_type == BinaryOp::Operation_ATAN2) return BinaryOp::Operation_RATAN2; + if (op_type == BinaryOp::Operation_RSUB) return BinaryOp::Operation_SUB; + if (op_type == BinaryOp::Operation_RDIV) return BinaryOp::Operation_DIV; + if (op_type == BinaryOp::Operation_RPOW) return BinaryOp::Operation_POW; + if (op_type == BinaryOp::Operation_RATAN2) return BinaryOp::Operation_ATAN2; + return op_type; +} + +int BinaryOp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& A = bottom_blobs[0]; + const Mat& B = bottom_blobs[1]; + const int outdims = std::max(A.dims, B.dims); + + Mat A2 = A; + Mat B2 = B; + if (A.dims < outdims) + { + // expand inner axes + if (outdims == 2) + { + if (A.w * A.elempack == B.h * B.elempack) + A2 = A.reshape(1, A.w, opt.workspace_allocator); + else // if (A.w == B.w) + { + A2.dims = 2; + A2.w = A.w * A.elempack; + A2.elempack = 1; + A2.elemsize = A.elemsize / A.elempack; + A2.cstep = A2.w; + } + } + if (outdims == 3 && A.dims == 1) + { + if (A.w * A.elempack == B.c * B.elempack) + A2 = A.reshape(1, 1, A.w, opt.workspace_allocator); + else // if (A.w == B.w) + { + A2.dims = 3; + A2.w = A.w * A.elempack; + A2.elempack = 1; + A2.elemsize = A.elemsize / A.elempack; + A2.cstep = A2.w; + } + } + if (outdims == 3 && A.dims == 2) + A2 = A.reshape(1, A.w, A.h, opt.workspace_allocator); + if (outdims == 4 && A.dims == 1) + { + if (A.w * A.elempack == B.c * B.elempack) + A2 = A.reshape(1, 1, 1, A.w, opt.workspace_allocator); + else // if (A.w == B.w) + { + A2.dims = 4; + A2.w = A.w * A.elempack; + A2.elempack = 1; + A2.elemsize = A.elemsize / A.elempack; + A2.cstep = A2.w; + } + } + if (outdims == 4 && A.dims == 2) + A2 = A.reshape(1, 1, A.w, A.h, opt.workspace_allocator); + if (outdims == 4 && A.dims == 3) + A2 = A.reshape(1, A.w, A.h, A.c, opt.workspace_allocator); + } + if (B.dims < outdims) + { + // expand inner axes + if (outdims == 2) + { + if (B.w * B.elempack == A.h * A.elempack) + B2 = B.reshape(1, B.w, opt.workspace_allocator); + else // if (B.w == A.w) + { + B2.dims = 2; + B2.w = B.w * B.elempack; + B2.elempack = 1; + B2.elemsize = B.elemsize / B.elempack; + B2.cstep = B2.w; + } + } + if (outdims == 3 && B.dims == 1) + { + if (B.w * B.elempack == A.c * A.elempack) + B2 = B.reshape(1, 1, B.w, opt.workspace_allocator); + else // if (B.w == A.w) + { + B2.dims = 3; + B2.w = B.w * B.elempack; + B2.elempack = 1; + B2.elemsize = B.elemsize / B.elempack; + B2.cstep = B2.w; + } + } + if (outdims == 3 && B.dims == 2) + B2 = B.reshape(1, B.w, B.h, opt.workspace_allocator); + if (outdims == 4 && B.dims == 1) + { + if (B.w * B.elempack == A.c * A.elempack) + B2 = B.reshape(1, 1, 1, B.w, opt.workspace_allocator); + else // if (B.w == A.w) + { + B2.dims = 4; + B2.w = B.w * B.elempack; + B2.elempack = 1; + B2.elemsize = 
B.elemsize / B.elempack; + B2.cstep = B2.w; + } + } + if (outdims == 4 && B.dims == 2) + B2 = B.reshape(1, 1, B.w, B.h, opt.workspace_allocator); + if (outdims == 4 && B.dims == 3) + B2 = B.reshape(1, B.w, B.h, B.c, opt.workspace_allocator); + } + + const int outw = std::max(A2.w, B2.w); + const int outh = std::max(A2.h, B2.h); + const int outd = std::max(A2.d, B2.d); + const int outc = std::max(A2.c, B2.c); + const size_t out_elemsize = std::max(A2.elemsize, B2.elemsize); + const int out_elempack = std::max(A2.elempack, B2.elempack); + + Mat& top_blob = top_blobs[0]; + if (outdims == 1) + { + top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator); + } + if (outdims == 2) + { + top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator); + } + if (outdims == 3) + { + top_blob.create(outw, outh, outc, out_elemsize, out_elempack, opt.blob_allocator); + } + if (outdims == 4) + { + top_blob.create(outw, outh, outd, outc, out_elemsize, out_elempack, opt.blob_allocator); + } + if (top_blob.empty()) + return -100; + + const bool a_pack_is_lower = A2.elempack < B2.elempack; + const bool a_pack_is_equal = A2.elempack == B2.elempack; + const bool a_size_is_lower = A2.w * A2.h * A2.d * A2.c * A2.elempack < B2.w * B2.h * B2.d * B2.c * B2.elempack; + if (a_pack_is_lower || (a_pack_is_equal && a_size_is_lower)) + { + binary_op_broadcast_fp16s(B2, A2, top_blob, get_reverse_op_type(op_type), opt); + } + else + { + binary_op_broadcast_fp16s(A2, B2, top_blob, op_type, opt); + } + + return 0; +} + +int BinaryOp_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + binary_op_scalar_inplace_fp16s(bottom_top_blob, (__fp16)b, op_type, opt); + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/cast_riscv.cpp b/src/layer/riscv/cast_riscv.cpp index 5d0642e7da7..9b01dd5a91a 100644 --- a/src/layer/riscv/cast_riscv.cpp +++ b/src/layer/riscv/cast_riscv.cpp @@ -18,15 +18,14 @@ #include #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { Cast_riscv::Cast_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector } @@ -89,55 +88,33 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt int size = w * h * d * elempack; -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH if (type_from == 1 && type_to == 2) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) +#if __riscv_vector + if (cpu_support_riscv_zvfh()) +#else + if (cpu_support_riscv_zfh()) +#endif { - const float* ptr = bottom_blob.channel(q); - __fp16* outptr = top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = vsetvl_e32m8(n); - - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - vfloat16m4_t _outp = vfncvt_f_f_w_f16m4(_p, vl); - vse16_v_f16m4(outptr, _outp, vl); - - ptr += vl; - outptr += vl; - n -= vl; - } + cast_fp32_to_fp16(bottom_blob, top_blob, opt); + return 0; } } if (type_from == 2 && type_to == 1) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) +#if __riscv_vector + if (cpu_support_riscv_zvfh()) +#else + if (cpu_support_riscv_zfh()) +#endif { - const __fp16* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = vsetvl_e16m4(n); - - vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); - vfloat32m8_t _outp = vfwcvt_f_f_v_f32m8(_p, vl); - vse32_v_f32m8(outptr, _outp, vl); - - ptr += vl; - outptr += 
vl; - n -= vl; - } + cast_fp16_to_fp32(bottom_blob, top_blob, opt); + return 0; } } -#endif // __riscv_vector && __riscv_zfh +#endif // NCNN_ZFH if (type_from == 3 && type_to == 1) { @@ -152,6 +129,8 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt outptr[i] = (float)ptr[i]; } } + + return 0; } // TODO more cast type diff --git a/src/layer/riscv/cast_riscv.h b/src/layer/riscv/cast_riscv.h index 7c6fbb6d4ce..4d4d4d3b424 100644 --- a/src/layer/riscv/cast_riscv.h +++ b/src/layer/riscv/cast_riscv.h @@ -25,6 +25,11 @@ class Cast_riscv : public Cast Cast_riscv(); virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +#if NCNN_ZFH + void cast_fp32_to_fp16(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + void cast_fp16_to_fp32(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/riscv/cast_riscv_zfh.cpp b/src/layer/riscv/cast_riscv_zfh.cpp new file mode 100644 index 00000000000..cb95eebf924 --- /dev/null +++ b/src/layer/riscv/cast_riscv_zfh.cpp @@ -0,0 +1,113 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
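// Editor's sketch (commentary, not patch content): the Cast_riscv::forward hunk
// above replaces the old compile-time `#if __riscv_vector && __riscv_zfh` guard
// with a runtime CPU check, so a single binary only enables the fp16 cast kernels
// on cores that actually implement Zfh/Zvfh. A minimal illustration of that
// dispatch shape, reusing ncnn's cpu_support_riscv_zvfh()/cpu_support_riscv_zfh()
// helpers from "cpu.h"; runtime_cast_uses_fp16() is a made-up name used here for
// illustration only, not an API added by this patch.
#include "cpu.h"

static bool runtime_cast_uses_fp16()
{
#if NCNN_ZFH
#if __riscv_vector
    // vector fp16 kernels need the Zvfh extension at runtime
    return ncnn::cpu_support_riscv_zvfh();
#else
    // scalar fp16 kernels only need Zfh
    return ncnn::cpu_support_riscv_zfh();
#endif
#else
    return false;
#endif
}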
+ +#include "cast_riscv.h" + +namespace ncnn { + +void Cast_riscv::cast_fp32_to_fp16(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int d = bottom_blob.d; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; + + const int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); +#if __riscv_zfh + __fp16* outptr = top_blob.channel(q); +#else + unsigned short* outptr = top_blob.channel(q); +#endif + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e32m8(n); + + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vfloat16m4_t _outp = __riscv_vfncvt_f_f_w_f16m4(_p, vl); + __riscv_vse16_v_f16m4(outptr, _outp, vl); + + ptr += vl; + outptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { +#if __riscv_zfh + *outptr++ = (__fp16)(*ptr++); +#else + *outptr++ = float32_to_float16(*ptr++); +#endif + } +#endif // __riscv_zvfh + } +} + +void Cast_riscv::cast_fp16_to_fp32(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int d = bottom_blob.d; + const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; + + const int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { +#if __riscv_zfh + const __fp16* ptr = bottom_blob.channel(q); +#else + const unsigned short* ptr = bottom_blob.channel(q); +#endif + float* outptr = top_blob.channel(q); + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + + vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl); + vfloat32m8_t _outp = __riscv_vfwcvt_f_f_v_f32m8(_p, vl); + __riscv_vse32_v_f32m8(outptr, _outp, vl); + + ptr += vl; + outptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { +#if __riscv_zfh + *outptr++ = (float)(*ptr++); +#else + *outptr++ = float16_to_float32(*ptr++); +#endif + } +#endif // __riscv_zvfh + } +} + +} // namespace ncnn diff --git a/src/layer/riscv/clip_riscv.cpp b/src/layer/riscv/clip_riscv.cpp index 8c43e06a4d8..52a82fe9b81 100644 --- a/src/layer/riscv/clip_riscv.cpp +++ b/src/layer/riscv/clip_riscv.cpp @@ -16,33 +16,34 @@ #if __riscv_vector #include -#include "rvv_mathfun.h" -#include "rvv_mathfun_fp16s.h" #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { Clip_riscv::Clip_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif } int Clip_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) { - if (opt.use_fp16_arithmetic) - return forward_inplace_fp16sa(bottom_top_blob, opt); - else - return forward_inplace_fp16s(bottom_top_blob, opt); + return forward_inplace_fp16s(bottom_top_blob, opt); } #endif @@ -62,12 +63,12 @@ int Clip_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - size_t vl = vsetvl_e32m8(n); + size_t vl = __riscv_vsetvl_e32m8(n); - 
vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - _p = vfmax_vf_f32m8(_p, min, vl); - _p = vfmin_vf_f32m8(_p, max, vl); - vse32_v_f32m8(ptr, _p, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + _p = __riscv_vfmax_vf_f32m8(_p, min, vl); + _p = __riscv_vfmin_vf_f32m8(_p, max, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; n -= vl; @@ -89,70 +90,4 @@ int Clip_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return 0; } -#if __riscv_vector && __riscv_zfh -int Clip_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const -{ - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int d = bottom_top_blob.d; - int channels = bottom_top_blob.c; - int elempack = bottom_top_blob.elempack; - int size = w * h * d * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = vsetvl_e16m4(n); - - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); - _p = vfmax_vf_f32m8(_p, min, vl); - _p = vfmin_vf_f32m8(_p, max, vl); - vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); - - ptr += vl; - n -= vl; - } - } - - return 0; -} - -int Clip_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const -{ - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int d = bottom_top_blob.d; - int channels = bottom_top_blob.c; - int elempack = bottom_top_blob.elempack; - int size = w * h * d * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - _p = vfmax_vf_f16m8(_p, min, vl); - _p = vfmin_vf_f16m8(_p, max, vl); - vse16_v_f16m8(ptr, _p, vl); - - ptr += vl; - n -= vl; - } - } - - return 0; -} -#endif // __riscv_vector && __riscv_zfh - } //namespace ncnn diff --git a/src/layer/riscv/clip_riscv.h b/src/layer/riscv/clip_riscv.h index 051995e18d6..241a249a204 100644 --- a/src/layer/riscv/clip_riscv.h +++ b/src/layer/riscv/clip_riscv.h @@ -27,9 +27,8 @@ class Clip_riscv : public Clip virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; - int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif }; diff --git a/src/layer/riscv/clip_riscv_zfh.cpp b/src/layer/riscv/clip_riscv_zfh.cpp new file mode 100644 index 00000000000..feeca51477f --- /dev/null +++ b/src/layer/riscv/clip_riscv_zfh.cpp @@ -0,0 +1,72 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "clip_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +namespace ncnn { + +#if NCNN_ZFH +int Clip_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + + __fp16 _min = (__fp16)min; + __fp16 _max = (__fp16)max; +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + + vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); + _p = __riscv_vfmax_vf_f32m8(_p, _min, vl); + _p = __riscv_vfmin_vf_f32m8(_p, _max, vl); + __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_vector + for (int i = 0; i < size; i++) + { + if (*ptr < _min) + *ptr = _min; + + if (*ptr > _max) + *ptr = _max; + + ptr++; + } +#endif // __riscv_vector + } + + return 0; +} +#endif // NCNN_ZFH + +} //namespace ncnn diff --git a/src/layer/riscv/concat_riscv.cpp b/src/layer/riscv/concat_riscv.cpp index 8eac1aba687..ae991d9251a 100644 --- a/src/layer/riscv/concat_riscv.cpp +++ b/src/layer/riscv/concat_riscv.cpp @@ -16,9 +16,10 @@ #if __riscv_vector #include +#include "riscv_usability.h" #endif // __riscv_vector -#include "riscv_usability.h" +#include "cpu.h" namespace ncnn { @@ -26,10 +27,10 @@ Concat_riscv::Concat_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #if NCNN_BF16 support_bf16_storage = true; @@ -40,7 +41,7 @@ int Concat_riscv::forward(const std::vector& bottom_blobs, std::vector { int elembits = bottom_blobs[0].elembits(); -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH if (opt.use_fp16_storage && elembits == 16) return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt); #endif @@ -143,7 +144,7 @@ int Concat_riscv::forward(const std::vector& bottom_blobs, std::vector #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); for (int i = 0; i < bottom_blob.h; i++) { @@ -153,8 +154,8 @@ int Concat_riscv::forward(const std::vector& bottom_blobs, std::vector for (int j = 0; j < w; j++) { - vfloat32m1_t _p = vle32_v_f32m1(r0, vl); - vsse32_v_f32m1(outptr0, w * sizeof(float), _p, vl); + vfloat32m1_t _p = __riscv_vle32_v_f32m1(r0, vl); + __riscv_vsse32_v_f32m1(outptr0, w * sizeof(float), _p, vl); r0 += packn; outptr0 += 1; @@ -271,7 +272,7 @@ int Concat_riscv::forward(const std::vector& bottom_blobs, std::vector #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int size = bottom_blob.w * bottom_blob.h * bottom_blob.d; @@ -283,8 +284,8 @@ int Concat_riscv::forward(const std::vector& bottom_blobs, std::vector for (int i = 0; i < size; i++) { - vfloat32m1_t _p = vle32_v_f32m1(r0, vl); - vsse32_v_f32m1(outptr0, top_blob_unpacked.cstep * sizeof(float), _p, vl); + vfloat32m1_t _p = __riscv_vle32_v_f32m1(r0, vl); + __riscv_vsse32_v_f32m1(outptr0, top_blob_unpacked.cstep * sizeof(float), _p, vl); r0 += packn; outptr0 += 1; @@ -545,7 +546,7 
@@ int Concat_riscv::forward_bf16s_fp16s(const std::vector& bottom_blobs, std: #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); for (int i = 0; i < bottom_blob.h; i++) { @@ -555,8 +556,8 @@ int Concat_riscv::forward_bf16s_fp16s(const std::vector& bottom_blobs, std: for (int j = 0; j < w; j++) { - vuint16m1_t _p = vle16_v_u16m1(r0, vl); - vsse16_v_u16m1(outptr0, w * sizeof(unsigned short), _p, vl); + vuint16m1_t _p = __riscv_vle16_v_u16m1(r0, vl); + __riscv_vsse16_v_u16m1(outptr0, w * sizeof(unsigned short), _p, vl); r0 += packn; outptr0 += 1; @@ -673,7 +674,7 @@ int Concat_riscv::forward_bf16s_fp16s(const std::vector& bottom_blobs, std: #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int size = bottom_blob.w * bottom_blob.h * bottom_blob.d; @@ -685,8 +686,8 @@ int Concat_riscv::forward_bf16s_fp16s(const std::vector& bottom_blobs, std: for (int i = 0; i < size; i++) { - vuint16m1_t _p = vle16_v_u16m1(r0, vl); - vsse16_v_u16m1(outptr0, top_blob_unpacked.cstep * sizeof(unsigned short), _p, vl); + vuint16m1_t _p = __riscv_vle16_v_u16m1(r0, vl); + __riscv_vsse16_v_u16m1(outptr0, top_blob_unpacked.cstep * sizeof(unsigned short), _p, vl); r0 += packn; outptr0 += 1; diff --git a/src/layer/riscv/convolution1d_riscv.cpp b/src/layer/riscv/convolution1d_riscv.cpp index 5671d4d4226..ca01f6c4a78 100644 --- a/src/layer/riscv/convolution1d_riscv.cpp +++ b/src/layer/riscv/convolution1d_riscv.cpp @@ -30,10 +30,14 @@ Convolution1D_riscv::Convolution1D_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif } int Convolution1D_riscv::create_pipeline(const Option& opt) @@ -41,8 +45,8 @@ int Convolution1D_riscv::create_pipeline(const Option& opt) if (dynamic_weight) return 0; -#if __riscv_vector && __riscv_zfh - if (opt.use_fp16_storage) +#if NCNN_ZFH + if (support_fp16_storage && opt.use_fp16_storage) { return create_pipeline_fp16s(opt); } @@ -108,9 +112,9 @@ int Convolution1D_riscv::destroy_pipeline(const Option& /*opt*/) int Convolution1D_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { +#if NCNN_ZFH int elembits = bottom_blob.elembits(); -#if __riscv_vector && __riscv_zfh if (opt.use_fp16_storage && elembits == 16) { if (opt.use_fp16_arithmetic) @@ -122,7 +126,7 @@ int Convolution1D_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op #if __riscv_vector const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -167,11 +171,11 @@ int Convolution1D_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op for (int j = 0; j < outw; j++) { - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias_term) { - _sum = vle32_v_f32m1((const float*)bias_data + p * packn, vl); + _sum = __riscv_vle32_v_f32m1((const float*)bias_data + p * packn, vl); } const float* kptr = weight_data_packed.channel(p); @@ -187,8 +191,8 @@ int Convolution1D_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op for (int l = 0; l < packn; l++) { float val = *slptr++; - 
vfloat32m1_t _w0 = vle32_v_f32m1(kptr, vl); - _sum = vfmacc_vf_f32m1(_sum, val, _w0, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(kptr, vl); + _sum = __riscv_vfmacc_vf_f32m1(_sum, val, _w0, vl); kptr += packn; } @@ -197,7 +201,7 @@ int Convolution1D_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse32_v_f32m1(outptr, _sum, vl); + __riscv_vse32_v_f32m1(outptr, _sum, vl); outptr += packn; } } @@ -214,11 +218,11 @@ int Convolution1D_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op for (int j = 0; j < outw; j++) { - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias_term) { - _sum = vle32_v_f32m1((const float*)bias_data + p * packn, vl); + _sum = __riscv_vle32_v_f32m1((const float*)bias_data + p * packn, vl); } const float* kptr = weight_data_packed.channel(p); @@ -230,8 +234,8 @@ int Convolution1D_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op for (int k = 0; k < kernel_w; k++) { float val = sptr[0]; - vfloat32m1_t _w = vle32_v_f32m1(kptr, vl); - _sum = vfmacc_vf_f32m1(_sum, val, _w, vl); + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl); + _sum = __riscv_vfmacc_vf_f32m1(_sum, val, _w, vl); sptr += dilation_w; kptr += packn; @@ -240,7 +244,7 @@ int Convolution1D_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse32_v_f32m1(outptr, _sum, vl); + __riscv_vse32_v_f32m1(outptr, _sum, vl); outptr += packn; } } @@ -264,7 +268,7 @@ int Convolution1D_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op sum = bias_data[p]; } - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); const float* kptr = weight_data_packed.channel(p); @@ -274,16 +278,16 @@ int Convolution1D_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op for (int k = 0; k < kernel_w; k++) { - vfloat32m1_t _val = vle32_v_f32m1(sptr, vl); - vfloat32m1_t _w = vle32_v_f32m1(kptr, vl); - _sum = vfmacc_vv_f32m1(_sum, _val, _w, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(sptr, vl); + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl); + _sum = __riscv_vfmacc_vv_f32m1(_sum, _val, _w, vl); sptr += dilation_w * packn; kptr += packn; } } - sum = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m1_f32m1(vfloat32m1_t(), _sum, vfmv_s_f_f32m1(vfloat32m1_t(), sum, vl), vl)); + sum = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m1_f32m1(_sum, __riscv_vfmv_s_f_f32m1(sum, vl), vl)); sum = activation_ss(sum, activation_type, activation_params); @@ -354,7 +358,7 @@ int Convolution1D_riscv::forward(const std::vector& bottom_blobs, std::vect return -100; #if NCNN_RVV - if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && weight_data_flattened.elembits() == 16) + if (opt.use_fp16_storage && cpu_support_riscv_zvfh() && weight_data_flattened.elembits() == 16) { Mat weight_data_flattened_fp32; cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt); @@ -376,7 +380,7 @@ int Convolution1D_riscv::forward(const std::vector& bottom_blobs, std::vect return -100; #if NCNN_RVV - if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && bias_data_flattened.elembits() == 16) + if (opt.use_fp16_storage && cpu_support_riscv_zvfh() && bias_data_flattened.elembits() == 16) { Mat bias_data_flattened_fp32; cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt); @@ 
-424,492 +428,4 @@ int Convolution1D_riscv::forward(const std::vector& bottom_blobs, std::vect return 0; } -#if __riscv_vector && __riscv_zfh -int Convolution1D_riscv::create_pipeline_fp16s(const Option& opt) -{ - const int packn = csrr_vlenb() / 2; - - const int num_input = weight_data_size / kernel_w / num_output; - - int elempack = 1; - int out_elempack = 1; - - if (opt.use_packing_layout) - { - elempack = num_input % packn == 0 ? packn : 1; - out_elempack = num_output % packn == 0 ? packn : 1; - } - - // src = kw-inch-outch - // dst = pb-pa-kw-inch/pa-outch/pb - { - Mat weight_data_r2 = weight_data.reshape(kernel_w, num_input, num_output); - - weight_data_fp16.create(kernel_w, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack); - - for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) - { - __fp16* g00 = weight_data_fp16.channel(q / out_elempack); - - for (int p = 0; p + (elempack - 1) < num_input; p += elempack) - { - for (int k = 0; k < kernel_w; k++) - { - for (int i = 0; i < elempack; i++) - { - for (int j = 0; j < out_elempack; j++) - { - const float* k00 = weight_data_r2.channel(q + j).row(p + i); - - g00[0] = (__fp16)k00[k]; - - g00++; - } - } - } - } - } - } - - ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); - - if (opt.lightmode) - weight_data.release(); - - return 0; -} - -int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); - - int w = bottom_blob.w; - int h = bottom_blob.h; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; - - Mat bottom_blob_bordered; - make_padding(bottom_blob, bottom_blob_bordered, opt); - if (bottom_blob_bordered.empty()) - return -100; - - w = bottom_blob_bordered.w; - h = bottom_blob_bordered.h; - - int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? 
packn : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; - - const int outw = (w - kernel_extent_w) / stride_w + 1; - const int outh = num_output / out_elempack; - - top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (elempack == packn && out_elempack == packn) - { - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < outh; p++) - { - __fp16* outptr = top_blob.row<__fp16>(p); - - for (int j = 0; j < outw; j++) - { - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); - - if (bias_term) - { - _sum = vle32_v_f32m2((const float*)bias_data + p * packn, vl); - } - - const __fp16* kptr = weight_data_fp16.channel(p); - - for (int q = 0; q < h; q++) - { - const __fp16* sptr = bottom_blob_bordered.row(q) + j * stride_w * packn; - - for (int k = 0; k < kernel_w; k++) - { - const __fp16* slptr = sptr + k * dilation_w * packn; - - for (int l = 0; l < packn; l++) - { - float val = (float)*slptr++; - vfloat16m1_t _w0 = vle16_v_f16m1(kptr, vl); - _sum = vfwmacc_vf_f32m2(_sum, val, _w0, vl); - - kptr += packn; - } - } - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - vse16_v_f16m1(outptr, vfncvt_f_f_w_f16m1(_sum, vl), vl); - outptr += packn; - } - } - } - } - - if (elempack == 1 && out_elempack == packn) - { - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < outh; p++) - { - __fp16* outptr = top_blob.row<__fp16>(p); - - for (int j = 0; j < outw; j++) - { - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); - - if (bias_term) - { - _sum = vle32_v_f32m2((const float*)bias_data + p * packn, vl); - } - - const __fp16* kptr = weight_data_fp16.channel(p); - - for (int q = 0; q < h; q++) - { - const __fp16* sptr = bottom_blob_bordered.row(q) + j * stride_w; - - for (int k = 0; k < kernel_w; k++) - { - float val = (float)sptr[0]; - vfloat16m1_t _w = vle16_v_f16m1(kptr, vl); - _sum = vfwmacc_vf_f32m2(_sum, val, _w, vl); - - sptr += dilation_w; - kptr += packn; - } - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - vse16_v_f16m1(outptr, vfncvt_f_f_w_f16m1(_sum, vl), vl); - outptr += packn; - } - } - } - } - - if (elempack == packn && out_elempack == 1) - { - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < outh; p++) - { - __fp16* outptr = top_blob.row<__fp16>(p); - - for (int j = 0; j < outw; j++) - { - float sum = 0.f; - - if (bias_term) - { - sum = bias_data[p]; - } - - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); - - const __fp16* kptr = weight_data_fp16.channel(p); - - for (int q = 0; q < h; q++) - { - const __fp16* sptr = bottom_blob_bordered.row(q) + j * stride_w * packn; - - for (int k = 0; k < kernel_w; k++) - { - vfloat16m1_t _val = vle16_v_f16m1(sptr, vl); - vfloat16m1_t _w = vle16_v_f16m1(kptr, vl); - _sum = vfwmacc_vv_f32m2(_sum, _val, _w, vl); - - sptr += dilation_w * packn; - kptr += packn; - } - } - -#if C906 - // TODO - std::vector ss(packn); - vse32_v_f32m2((float*)ss.data(), _sum, vl); - for (int i = 0; i < packn; i++) - { - sum += ss[i]; - } -#else - sum = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m2_f32m1(vfloat32m1_t(), _sum, vfmv_s_f_f32m1(vfloat32m1_t(), sum, vl), vl)); -#endif - - sum = activation_ss(sum, activation_type, activation_params); - - outptr[j] = (__fp16)sum; - } - } - } - } - - if (elempack == 1 && out_elempack == 1) - { - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < outh; p++) - { - __fp16* outptr = top_blob.row<__fp16>(p); - - for 
(int j = 0; j < outw; j++) - { - float sum = 0.f; - - if (bias_term) - { - sum = bias_data[p]; - } - - const __fp16* kptr = weight_data_fp16.channel(p); - - for (int q = 0; q < h; q++) - { - const __fp16* sptr = bottom_blob_bordered.row<__fp16>(q) + j * stride_w; - - for (int k = 0; k < kernel_w; k++) - { - float val = (float)sptr[0]; - float w = (float)kptr[0]; - sum += val * w; - - sptr += dilation_w; - kptr += 1; - } - } - - sum = activation_ss(sum, activation_type, activation_params); - - outptr[j] = (__fp16)sum; - } - } - } - } - - return 0; -} - -int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); - - int w = bottom_blob.w; - int h = bottom_blob.h; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; - - Mat bottom_blob_bordered; - make_padding(bottom_blob, bottom_blob_bordered, opt); - if (bottom_blob_bordered.empty()) - return -100; - - w = bottom_blob_bordered.w; - h = bottom_blob_bordered.h; - - int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; - - const int outw = (w - kernel_extent_w) / stride_w + 1; - const int outh = num_output / out_elempack; - - top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (elempack == packn && out_elempack == packn) - { - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < outh; p++) - { - __fp16* outptr = top_blob.row<__fp16>(p); - - for (int j = 0; j < outw; j++) - { - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); - - if (bias_term) - { - _sum = vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl); - } - - const __fp16* kptr = weight_data_fp16.channel(p); - - for (int q = 0; q < h; q++) - { - const __fp16* sptr = bottom_blob_bordered.row(q) + j * stride_w * packn; - - for (int k = 0; k < kernel_w; k++) - { - const __fp16* slptr = sptr + k * dilation_w * packn; - - for (int l = 0; l < packn; l++) - { - __fp16 val = *slptr++; - vfloat16m1_t _w0 = vle16_v_f16m1(kptr, vl); - _sum = vfmacc_vf_f16m1(_sum, val, _w0, vl); - - kptr += packn; - } - } - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - vse16_v_f16m1(outptr, _sum, vl); - outptr += packn; - } - } - } - } - - if (elempack == 1 && out_elempack == packn) - { - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < outh; p++) - { - __fp16* outptr = top_blob.row<__fp16>(p); - - for (int j = 0; j < outw; j++) - { - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); - - if (bias_term) - { - _sum = vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl); - } - - const __fp16* kptr = weight_data_fp16.channel(p); - - for (int q = 0; q < h; q++) - { - const __fp16* sptr = bottom_blob_bordered.row(q) + j * stride_w; - - for (int k = 0; k < kernel_w; k++) - { - __fp16 val = sptr[0]; - vfloat16m1_t _w = vle16_v_f16m1(kptr, vl); - _sum = vfmacc_vf_f16m1(_sum, val, _w, vl); - - sptr += dilation_w; - kptr += packn; - } - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - vse16_v_f16m1(outptr, _sum, vl); - outptr += packn; - } - } - } - } - - if (elempack == packn && out_elempack == 1) - { - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < outh; p++) - { - __fp16* outptr = 
top_blob.row<__fp16>(p); - - for (int j = 0; j < outw; j++) - { - __fp16 sum = 0.f; - - if (bias_term) - { - sum = ((const __fp16*)bias_data_fp16)[p]; - } - - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); - - const __fp16* kptr = weight_data_fp16.channel(p); - - for (int q = 0; q < h; q++) - { - const __fp16* sptr = bottom_blob_bordered.row(q) + j * stride_w * packn; - - for (int k = 0; k < kernel_w; k++) - { - vfloat16m1_t _val = vle16_v_f16m1(sptr, vl); - vfloat16m1_t _w = vle16_v_f16m1(kptr, vl); - _sum = vfmacc_vv_f16m1(_sum, _val, _w, vl); - - sptr += dilation_w * packn; - kptr += packn; - } - } - - sum = vfmv_f_s_f16m1_f16(vfredusum_vs_f16m1_f16m1(vfloat16m1_t(), _sum, vfmv_s_f_f16m1(vfloat16m1_t(), sum, vl), vl)); - - sum = activation_ss(sum, activation_type, activation_params); - - outptr[j] = sum; - } - } - } - } - - if (elempack == 1 && out_elempack == 1) - { - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < outh; p++) - { - __fp16* outptr = top_blob.row<__fp16>(p); - - for (int j = 0; j < outw; j++) - { - float sum = 0.f; - - if (bias_term) - { - sum = bias_data[p]; - } - - const __fp16* kptr = weight_data_fp16.channel(p); - - for (int q = 0; q < h; q++) - { - const __fp16* sptr = bottom_blob_bordered.row<__fp16>(q) + j * stride_w; - - for (int k = 0; k < kernel_w; k++) - { - float val = (float)sptr[0]; - float w = (float)kptr[0]; - sum += val * w; - - sptr += dilation_w; - kptr += 1; - } - } - - sum = activation_ss(sum, activation_type, activation_params); - - outptr[j] = (__fp16)sum; - } - } - } - } - - return 0; -} -#endif // __riscv_vector && __riscv_zfh - } // namespace ncnn diff --git a/src/layer/riscv/convolution1d_riscv.h b/src/layer/riscv/convolution1d_riscv.h index f0e7f881801..98a21be002f 100644 --- a/src/layer/riscv/convolution1d_riscv.h +++ b/src/layer/riscv/convolution1d_riscv.h @@ -32,7 +32,7 @@ class Convolution1D_riscv : public Convolution1D virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int create_pipeline_fp16s(const Option& opt); int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; diff --git a/src/layer/riscv/convolution1d_riscv_zfh.cpp b/src/layer/riscv/convolution1d_riscv_zfh.cpp new file mode 100644 index 00000000000..47a9e7a5a9a --- /dev/null +++ b/src/layer/riscv/convolution1d_riscv_zfh.cpp @@ -0,0 +1,540 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "convolution1d_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +#include "riscv_activation.h" +#include "riscv_usability.h" + +#include "cpu.h" +#include "layer_type.h" + +namespace ncnn { + +#if NCNN_ZFH +int Convolution1D_riscv::create_pipeline_fp16s(const Option& opt) +{ +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh + + const int num_input = weight_data_size / kernel_w / num_output; + + int elempack = 1; + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + elempack = num_input % packn == 0 ? packn : 1; + out_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh + + // src = kw-inch-outch + // dst = pb-pa-kw-inch/pa-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(kernel_w, num_input, num_output); + + weight_data_fp16.create(kernel_w, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + __fp16* g00 = weight_data_fp16.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < kernel_w; k++) + { + for (int i = 0; i < elempack; i++) + { + for (int j = 0; j < out_elempack; j++) + { + const float* k00 = weight_data_r2.channel(q + j).row(p + i); + + g00[0] = (__fp16)k00[k]; + + g00++; + } + } + } + } + } + } + + ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); + + if (opt.lightmode) + weight_data.release(); + + return 0; +} + +int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; + const size_t vl = __riscv_vsetvl_e16m1(packn); +#endif // __riscv_zvfh + + int w = bottom_blob.w; + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? 
packn : 1; + } +#endif // __riscv_zvfh + size_t out_elemsize = elemsize / elempack * out_elempack; + + const int outw = (w - kernel_extent_w) / stride_w + 1; + const int outh = num_output / out_elempack; + + top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __riscv_zvfh + if (elempack == packn && out_elempack == packn) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + __fp16* outptr = top_blob.row<__fp16>(p); + + for (int j = 0; j < outw; j++) + { + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vle32_v_f32m2((const float*)bias_data + p * packn, vl); + } + + const __fp16* kptr = weight_data_fp16.channel(p); + + for (int q = 0; q < h; q++) + { + const __fp16* sptr = bottom_blob_bordered.row(q) + j * stride_w * packn; + + for (int k = 0; k < kernel_w; k++) + { + const __fp16* slptr = sptr + k * dilation_w * packn; + + for (int l = 0; l < packn; l++) + { + float val = (float)*slptr++; + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr, vl); + _sum = __riscv_vfwmacc_vf_f32m2(_sum, val, _w0, vl); + + kptr += packn; + } + } + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); + outptr += packn; + } + } + } + } + + if (elempack == 1 && out_elempack == packn) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + __fp16* outptr = top_blob.row<__fp16>(p); + + for (int j = 0; j < outw; j++) + { + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vle32_v_f32m2((const float*)bias_data + p * packn, vl); + } + + const __fp16* kptr = weight_data_fp16.channel(p); + + for (int q = 0; q < h; q++) + { + const __fp16* sptr = bottom_blob_bordered.row(q) + j * stride_w; + + for (int k = 0; k < kernel_w; k++) + { + float val = (float)sptr[0]; + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); + _sum = __riscv_vfwmacc_vf_f32m2(_sum, val, _w, vl); + + sptr += dilation_w; + kptr += packn; + } + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); + outptr += packn; + } + } + } + } + + if (elempack == packn && out_elempack == 1) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + __fp16* outptr = top_blob.row<__fp16>(p); + + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); + + const __fp16* kptr = weight_data_fp16.channel(p); + + for (int q = 0; q < h; q++) + { + const __fp16* sptr = bottom_blob_bordered.row(q) + j * stride_w * packn; + + for (int k = 0; k < kernel_w; k++) + { + vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr, vl); + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); + _sum = __riscv_vfwmacc_vv_f32m2(_sum, _val, _w, vl); + + sptr += dilation_w * packn; + kptr += packn; + } + } + +#if C906 + // TODO + std::vector ss(packn); + __riscv_vse32_v_f32m2((float*)ss.data(), _sum, vl); + for (int i = 0; i < packn; i++) + { + sum += ss[i]; + } +#else + sum = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(_sum, __riscv_vfmv_s_f_f32m1(sum, vl), vl)); +#endif + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = (__fp16)sum; + } + } + } + } +#endif // __riscv_zvfh 
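Sketch (not part of the patch) of the two accumulation idioms used by forward_fp16s above and forward_fp16sa below, reduced to a single packn-wide dot product. It assumes a compiler with the prefixed v1.0 intrinsics and zvfh enabled; the function names are illustrative and both inputs must hold at least one full fp16 vector.

#include <riscv_vector.h>

// fp16s idiom: fp16 operands, widening multiply-accumulate into an fp32 accumulator (vfwmacc),
// then a horizontal reduction in fp32.
static float dot_packn_fp16s(const __fp16* a, const __fp16* b)
{
    const size_t vl = __riscv_vsetvl_e16m1(__riscv_vsetvlmax_e16m1());
    vfloat16m1_t _a = __riscv_vle16_v_f16m1(a, vl);
    vfloat16m1_t _b = __riscv_vle16_v_f16m1(b, vl);
    vfloat32m2_t _acc = __riscv_vfmv_v_f_f32m2(0.f, vl);
    _acc = __riscv_vfwmacc_vv_f32m2(_acc, _a, _b, vl);
    vfloat32m1_t _s = __riscv_vfredusum_vs_f32m2_f32m1(_acc, __riscv_vfmv_s_f_f32m1(0.f, vl), vl);
    return __riscv_vfmv_f_s_f32m1_f32(_s);
}

// fp16sa idiom: multiply, accumulate and reduce entirely in fp16 registers (vfmacc).
static __fp16 dot_packn_fp16sa(const __fp16* a, const __fp16* b)
{
    const size_t vl = __riscv_vsetvl_e16m1(__riscv_vsetvlmax_e16m1());
    vfloat16m1_t _a = __riscv_vle16_v_f16m1(a, vl);
    vfloat16m1_t _b = __riscv_vle16_v_f16m1(b, vl);
    vfloat16m1_t _acc = __riscv_vfmv_v_f_f16m1(0.f, vl);
    _acc = __riscv_vfmacc_vv_f16m1(_acc, _a, _b, vl);
    vfloat16m1_t _s = __riscv_vfredusum_vs_f16m1_f16m1(_acc, __riscv_vfmv_s_f_f16m1((__fp16)0.f, vl), vl);
    return __riscv_vfmv_f_s_f16m1_f16(_s);
}

The fp16s form spends a widening step to keep the accumulator in fp32, which is why forward_fp16s above loads fp16 but reduces in f32m2 registers, while forward_fp16sa trades that accuracy margin for staying entirely in fp16.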
+ + if (elempack == 1 && out_elempack == 1) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + __fp16* outptr = top_blob.row<__fp16>(p); + + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + const __fp16* kptr = weight_data_fp16.channel(p); + + for (int q = 0; q < h; q++) + { + const __fp16* sptr = bottom_blob_bordered.row<__fp16>(q) + j * stride_w; + + for (int k = 0; k < kernel_w; k++) + { + float val = (float)sptr[0]; + float w = (float)kptr[0]; + sum += val * w; + + sptr += dilation_w; + kptr += 1; + } + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = (__fp16)sum; + } + } + } + } + + return 0; +} + +int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; + const size_t vl = __riscv_vsetvl_e16m1(packn); +#endif // __riscv_zvfh + + int w = bottom_blob.w; + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh + size_t out_elemsize = elemsize / elempack * out_elempack; + + const int outw = (w - kernel_extent_w) / stride_w + 1; + const int outh = num_output / out_elempack; + + top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __riscv_zvfh + if (elempack == packn && out_elempack == packn) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + __fp16* outptr = top_blob.row<__fp16>(p); + + for (int j = 0; j < outw; j++) + { + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl); + } + + const __fp16* kptr = weight_data_fp16.channel(p); + + for (int q = 0; q < h; q++) + { + const __fp16* sptr = bottom_blob_bordered.row(q) + j * stride_w * packn; + + for (int k = 0; k < kernel_w; k++) + { + const __fp16* slptr = sptr + k * dilation_w * packn; + + for (int l = 0; l < packn; l++) + { + __fp16 val = *slptr++; + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr, vl); + _sum = __riscv_vfmacc_vf_f16m1(_sum, val, _w0, vl); + + kptr += packn; + } + } + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr, _sum, vl); + outptr += packn; + } + } + } + } + + if (elempack == 1 && out_elempack == packn) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + __fp16* outptr = top_blob.row<__fp16>(p); + + for (int j = 0; j < outw; j++) + { + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl); + } + + const __fp16* kptr = weight_data_fp16.channel(p); + + for (int q = 0; q < h; q++) + { + const __fp16* sptr = bottom_blob_bordered.row(q) + j * stride_w; + + for (int k = 0; k < kernel_w; k++) + { + __fp16 val = sptr[0]; + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); + _sum = 
__riscv_vfmacc_vf_f16m1(_sum, val, _w, vl); + + sptr += dilation_w; + kptr += packn; + } + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr, _sum, vl); + outptr += packn; + } + } + } + } + + if (elempack == packn && out_elempack == 1) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + __fp16* outptr = top_blob.row<__fp16>(p); + + for (int j = 0; j < outw; j++) + { + __fp16 sum = 0.f; + + if (bias_term) + { + sum = ((const __fp16*)bias_data_fp16)[p]; + } + + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + + const __fp16* kptr = weight_data_fp16.channel(p); + + for (int q = 0; q < h; q++) + { + const __fp16* sptr = bottom_blob_bordered.row(q) + j * stride_w * packn; + + for (int k = 0; k < kernel_w; k++) + { + vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr, vl); + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); + _sum = __riscv_vfmacc_vv_f16m1(_sum, _val, _w, vl); + + sptr += dilation_w * packn; + kptr += packn; + } + } + + sum = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum, __riscv_vfmv_s_f_f16m1(sum, vl), vl)); + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + } + } + } +#endif // __riscv_zvfh + + if (elempack == 1 && out_elempack == 1) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + __fp16* outptr = top_blob.row<__fp16>(p); + + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + const __fp16* kptr = weight_data_fp16.channel(p); + + for (int q = 0; q < h; q++) + { + const __fp16* sptr = bottom_blob_bordered.row<__fp16>(q) + j * stride_w; + + for (int k = 0; k < kernel_w; k++) + { + float val = (float)sptr[0]; + float w = (float)kptr[0]; + sum += val * w; + + sptr += dilation_w; + kptr += 1; + } + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = (__fp16)sum; + } + } + } + } + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/convolution_1x1_packn.h b/src/layer/riscv/convolution_1x1_packn.h index 31bf72ba3d0..3876923a699 100644 --- a/src/layer/riscv/convolution_1x1_packn.h +++ b/src/layer/riscv/convolution_1x1_packn.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packn_rvv(const Mat& bottom_blob, Mat& top_blob, con static void conv1x1s2_sgemm_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -53,8 +53,8 @@ static void conv1x1s2_sgemm_packn_rvv(const Mat& bottom_blob, Mat& top_blob, con { for (int j = 0; j < outw; j++) { - vfloat32m1_t _val = vle32_v_f32m1(r0, vl); - vse32_v_f32m1(outptr, _val, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(r0, vl); + __riscv_vse32_v_f32m1(outptr, _val, vl); r0 += packn * 2; outptr += packn; diff --git a/src/layer/riscv/convolution_1x1_packn_fp16s.h b/src/layer/riscv/convolution_1x1_packn_fp16s.h index 5ac3f8967ce..f707b32d8e8 100644 --- a/src/layer/riscv/convolution_1x1_packn_fp16s.h +++ b/src/layer/riscv/convolution_1x1_packn_fp16s.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_bl static void conv1x1s2_sgemm_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& 
opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -53,8 +53,8 @@ static void conv1x1s2_sgemm_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_bl { for (int j = 0; j < outw; j++) { - vfloat16m1_t _val = vle16_v_f16m1(r0, vl); - vse16_v_f16m1(outptr, _val, vl); + vfloat16m1_t _val = __riscv_vle16_v_f16m1(r0, vl); + __riscv_vse16_v_f16m1(outptr, _val, vl); r0 += packn * 2; outptr += packn; diff --git a/src/layer/riscv/convolution_1x1_packnto1.h b/src/layer/riscv/convolution_1x1_packnto1.h index a3e1204a325..3e425813ecf 100644 --- a/src/layer/riscv/convolution_1x1_packnto1.h +++ b/src/layer/riscv/convolution_1x1_packnto1.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, static void conv1x1s2_sgemm_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -53,8 +53,8 @@ static void conv1x1s2_sgemm_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, { for (int j = 0; j < outw; j++) { - vfloat32m1_t _val = vle32_v_f32m1(r0, vl); - vse32_v_f32m1(outptr, _val, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(r0, vl); + __riscv_vse32_v_f32m1(outptr, _val, vl); r0 += packn * 2; outptr += packn; diff --git a/src/layer/riscv/convolution_1x1_packnto1_fp16s.h b/src/layer/riscv/convolution_1x1_packnto1_fp16s.h index 10591ab27f2..f53ec87249f 100644 --- a/src/layer/riscv/convolution_1x1_packnto1_fp16s.h +++ b/src/layer/riscv/convolution_1x1_packnto1_fp16s.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top static void conv1x1s2_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -53,8 +53,8 @@ static void conv1x1s2_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top { for (int j = 0; j < outw; j++) { - vfloat16m1_t _val = vle16_v_f16m1(r0, vl); - vse16_v_f16m1(outptr, _val, vl); + vfloat16m1_t _val = __riscv_vle16_v_f16m1(r0, vl); + __riscv_vse16_v_f16m1(outptr, _val, vl); r0 += packn * 2; outptr += packn; diff --git a/src/layer/riscv/convolution_3x3_pack1ton.h b/src/layer/riscv/convolution_3x3_pack1ton.h index 9adcfb1e263..fe7f8267005 100644 --- a/src/layer/riscv/convolution_3x3_pack1ton.h +++ b/src/layer/riscv/convolution_3x3_pack1ton.h @@ -15,7 +15,7 @@ static void conv3x3s1_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int inch = bottom_blob.c; int outw = top_blob.w; @@ -29,7 +29,7 @@ static void conv3x3s1_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const { Mat out0 = top_blob.channel(p); - vfloat32m1_t _bias0 = bias ? vle32_v_f32m1(bias + p * packn, vl) : vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _bias0 = bias ? 
__riscv_vle32_v_f32m1(bias + p * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); out0.fill(_bias0); const float* k0 = kernel.channel(p); @@ -45,15 +45,15 @@ static void conv3x3s1_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const const float* r1 = img0.row(1); const float* r2 = img0.row(2); - vfloat32m1_t _k00 = vle32_v_f32m1(k0, vl); - vfloat32m1_t _k01 = vle32_v_f32m1(k0 + packn, vl); - vfloat32m1_t _k02 = vle32_v_f32m1(k0 + packn * 2, vl); - vfloat32m1_t _k10 = vle32_v_f32m1(k0 + packn * 3, vl); - vfloat32m1_t _k11 = vle32_v_f32m1(k0 + packn * 4, vl); - vfloat32m1_t _k12 = vle32_v_f32m1(k0 + packn * 5, vl); - vfloat32m1_t _k20 = vle32_v_f32m1(k0 + packn * 6, vl); - vfloat32m1_t _k21 = vle32_v_f32m1(k0 + packn * 7, vl); - vfloat32m1_t _k22 = vle32_v_f32m1(k0 + packn * 8, vl); + vfloat32m1_t _k00 = __riscv_vle32_v_f32m1(k0, vl); + vfloat32m1_t _k01 = __riscv_vle32_v_f32m1(k0 + packn, vl); + vfloat32m1_t _k02 = __riscv_vle32_v_f32m1(k0 + packn * 2, vl); + vfloat32m1_t _k10 = __riscv_vle32_v_f32m1(k0 + packn * 3, vl); + vfloat32m1_t _k11 = __riscv_vle32_v_f32m1(k0 + packn * 4, vl); + vfloat32m1_t _k12 = __riscv_vle32_v_f32m1(k0 + packn * 5, vl); + vfloat32m1_t _k20 = __riscv_vle32_v_f32m1(k0 + packn * 6, vl); + vfloat32m1_t _k21 = __riscv_vle32_v_f32m1(k0 + packn * 7, vl); + vfloat32m1_t _k22 = __riscv_vle32_v_f32m1(k0 + packn * 8, vl); int i = 0; for (; i < outh; i++) @@ -61,98 +61,98 @@ static void conv3x3s1_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const int j = 0; for (; j + 7 < outw; j += 8) { - vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl); - vfloat32m1_t _sum1 = vle32_v_f32m1(outptr0 + packn, vl); - vfloat32m1_t _sum2 = vle32_v_f32m1(outptr0 + packn * 2, vl); - vfloat32m1_t _sum3 = vle32_v_f32m1(outptr0 + packn * 3, vl); - vfloat32m1_t _sum4 = vle32_v_f32m1(outptr0 + packn * 4, vl); - vfloat32m1_t _sum5 = vle32_v_f32m1(outptr0 + packn * 5, vl); - vfloat32m1_t _sum6 = vle32_v_f32m1(outptr0 + packn * 6, vl); - vfloat32m1_t _sum7 = vle32_v_f32m1(outptr0 + packn * 7, vl); - - _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[1], _k00, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[2], _k00, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[3], _k00, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r0[4], _k00, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r0[5], _k00, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r0[6], _k00, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r0[7], _k00, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[2], _k01, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[3], _k01, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[4], _k01, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r0[5], _k01, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r0[6], _k01, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r0[7], _k01, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r0[8], _k01, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[3], _k02, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[4], _k02, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[5], _k02, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r0[6], _k02, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r0[7], _k02, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r0[8], _k02, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r0[9], _k02, vl); - - _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[1], _k10, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[2], _k10, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[3], _k10, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r1[4], _k10, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r1[5], _k10, 
vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r1[6], _k10, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r1[7], _k10, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[2], _k11, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[3], _k11, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[4], _k11, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r1[5], _k11, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r1[6], _k11, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r1[7], _k11, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r1[8], _k11, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[3], _k12, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[4], _k12, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[5], _k12, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r1[6], _k12, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r1[7], _k12, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r1[8], _k12, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r1[9], _k12, vl); - - _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[1], _k20, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[2], _k20, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[3], _k20, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r2[4], _k20, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r2[5], _k20, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r2[6], _k20, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r2[7], _k20, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[2], _k21, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[3], _k21, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[4], _k21, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r2[5], _k21, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r2[6], _k21, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r2[7], _k21, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r2[8], _k21, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[3], _k22, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[4], _k22, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[5], _k22, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r2[6], _k22, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r2[7], _k22, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r2[8], _k22, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r2[9], _k22, vl); - - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + packn, _sum1, vl); - vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl); - vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl); - vse32_v_f32m1(outptr0 + packn * 4, _sum4, vl); - vse32_v_f32m1(outptr0 + packn * 5, _sum5, vl); - vse32_v_f32m1(outptr0 + packn * 6, _sum6, vl); - vse32_v_f32m1(outptr0 + packn * 7, _sum7, vl); + vfloat32m1_t _sum0 = __riscv_vle32_v_f32m1(outptr0, vl); + vfloat32m1_t _sum1 = __riscv_vle32_v_f32m1(outptr0 + packn, vl); + vfloat32m1_t _sum2 = __riscv_vle32_v_f32m1(outptr0 + packn * 2, vl); + vfloat32m1_t _sum3 = __riscv_vle32_v_f32m1(outptr0 + packn * 3, vl); + vfloat32m1_t _sum4 = __riscv_vle32_v_f32m1(outptr0 + packn * 4, vl); + vfloat32m1_t _sum5 = __riscv_vle32_v_f32m1(outptr0 + packn * 5, vl); + vfloat32m1_t _sum6 = __riscv_vle32_v_f32m1(outptr0 + packn * 6, vl); + vfloat32m1_t _sum7 = __riscv_vle32_v_f32m1(outptr0 + packn * 7, vl); + + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[1], _k00, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[2], _k00, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[3], _k00, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r0[4], _k00, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r0[5], _k00, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r0[6], _k00, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r0[7], _k00, vl); + 
_sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[2], _k01, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[3], _k01, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[4], _k01, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r0[5], _k01, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r0[6], _k01, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r0[7], _k01, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r0[8], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[3], _k02, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[4], _k02, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[5], _k02, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r0[6], _k02, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r0[7], _k02, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r0[8], _k02, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r0[9], _k02, vl); + + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[1], _k10, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[2], _k10, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[3], _k10, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r1[4], _k10, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r1[5], _k10, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r1[6], _k10, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r1[7], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[2], _k11, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[3], _k11, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[4], _k11, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r1[5], _k11, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r1[6], _k11, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r1[7], _k11, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r1[8], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[3], _k12, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[4], _k12, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[5], _k12, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r1[6], _k12, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r1[7], _k12, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r1[8], _k12, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r1[9], _k12, vl); + + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[1], _k20, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[2], _k20, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[3], _k20, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r2[4], _k20, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r2[5], _k20, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r2[6], _k20, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r2[7], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[2], _k21, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[3], _k21, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[4], _k21, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r2[5], _k21, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r2[6], _k21, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r2[7], _k21, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r2[8], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[3], _k22, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[4], _k22, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[5], _k22, vl); + _sum4 = 
__riscv_vfmacc_vf_f32m1(_sum4, r2[6], _k22, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r2[7], _k22, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r2[8], _k22, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r2[9], _k22, vl); + + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0 + packn, _sum1, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 4, _sum4, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 5, _sum5, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 6, _sum6, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 7, _sum7, vl); outptr0 += packn * 8; @@ -162,54 +162,54 @@ static void conv3x3s1_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const } for (; j + 3 < outw; j += 4) { - vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl); - vfloat32m1_t _sum1 = vle32_v_f32m1(outptr0 + packn, vl); - vfloat32m1_t _sum2 = vle32_v_f32m1(outptr0 + packn * 2, vl); - vfloat32m1_t _sum3 = vle32_v_f32m1(outptr0 + packn * 3, vl); - - _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[1], _k00, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[2], _k00, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[3], _k00, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[2], _k01, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[3], _k01, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[4], _k01, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[3], _k02, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[4], _k02, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[5], _k02, vl); - - _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[1], _k10, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[2], _k10, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[3], _k10, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[2], _k11, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[3], _k11, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[4], _k11, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[3], _k12, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[4], _k12, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[5], _k12, vl); - - _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[1], _k20, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[2], _k20, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[3], _k20, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[2], _k21, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[3], _k21, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[4], _k21, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[3], _k22, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[4], _k22, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[5], _k22, vl); - - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + packn, _sum1, vl); - vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl); - vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl); + vfloat32m1_t _sum0 = __riscv_vle32_v_f32m1(outptr0, vl); + vfloat32m1_t _sum1 = __riscv_vle32_v_f32m1(outptr0 + packn, vl); + vfloat32m1_t _sum2 = __riscv_vle32_v_f32m1(outptr0 + packn * 2, vl); + vfloat32m1_t _sum3 = __riscv_vle32_v_f32m1(outptr0 + packn * 3, vl); + + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[1], _k00, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[2], _k00, vl); + _sum3 = 
__riscv_vfmacc_vf_f32m1(_sum3, r0[3], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[2], _k01, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[3], _k01, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[4], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[3], _k02, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[4], _k02, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[5], _k02, vl); + + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[1], _k10, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[2], _k10, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[3], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[2], _k11, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[3], _k11, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[4], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[3], _k12, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[4], _k12, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[5], _k12, vl); + + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[1], _k20, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[2], _k20, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[3], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[2], _k21, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[3], _k21, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[4], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[3], _k22, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[4], _k22, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[5], _k22, vl); + + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0 + packn, _sum1, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl); outptr0 += packn * 4; @@ -219,32 +219,32 @@ static void conv3x3s1_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const } for (; j + 1 < outw; j += 2) { - vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl); - vfloat32m1_t _sum1 = vle32_v_f32m1(outptr0 + packn, vl); - - _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[1], _k00, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[2], _k01, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[3], _k02, vl); - - _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[1], _k10, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[2], _k11, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[3], _k12, vl); - - _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[1], _k20, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[2], _k21, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[3], _k22, vl); - - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + packn, _sum1, vl); + vfloat32m1_t _sum0 = __riscv_vle32_v_f32m1(outptr0, vl); + vfloat32m1_t _sum1 = __riscv_vle32_v_f32m1(outptr0 + packn, 
vl); + + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[1], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[2], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[3], _k02, vl); + + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[1], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[2], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[3], _k12, vl); + + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[1], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[2], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[3], _k22, vl); + + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0 + packn, _sum1, vl); outptr0 += packn * 2; @@ -254,21 +254,21 @@ static void conv3x3s1_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const } for (; j < outw; j++) { - vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl); + vfloat32m1_t _sum0 = __riscv_vle32_v_f32m1(outptr0, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); - vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); outptr0 += packn; @@ -290,7 +290,7 @@ static void conv3x3s1_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const static void conv3x3s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; @@ -307,7 +307,7 @@ static void conv3x3s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const { Mat out0 = top_blob.channel(p); - vfloat32m1_t _bias0 = bias ? vle32_v_f32m1(bias + p * packn, vl) : vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _bias0 = bias ? 
__riscv_vle32_v_f32m1(bias + p * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); out0.fill(_bias0); const float* k0 = kernel.channel(p); @@ -323,15 +323,15 @@ static void conv3x3s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const const float* r1 = img0.row(1); const float* r2 = img0.row(2); - vfloat32m1_t _k00 = vle32_v_f32m1(k0, vl); - vfloat32m1_t _k01 = vle32_v_f32m1(k0 + packn, vl); - vfloat32m1_t _k02 = vle32_v_f32m1(k0 + packn * 2, vl); - vfloat32m1_t _k10 = vle32_v_f32m1(k0 + packn * 3, vl); - vfloat32m1_t _k11 = vle32_v_f32m1(k0 + packn * 4, vl); - vfloat32m1_t _k12 = vle32_v_f32m1(k0 + packn * 5, vl); - vfloat32m1_t _k20 = vle32_v_f32m1(k0 + packn * 6, vl); - vfloat32m1_t _k21 = vle32_v_f32m1(k0 + packn * 7, vl); - vfloat32m1_t _k22 = vle32_v_f32m1(k0 + packn * 8, vl); + vfloat32m1_t _k00 = __riscv_vle32_v_f32m1(k0, vl); + vfloat32m1_t _k01 = __riscv_vle32_v_f32m1(k0 + packn, vl); + vfloat32m1_t _k02 = __riscv_vle32_v_f32m1(k0 + packn * 2, vl); + vfloat32m1_t _k10 = __riscv_vle32_v_f32m1(k0 + packn * 3, vl); + vfloat32m1_t _k11 = __riscv_vle32_v_f32m1(k0 + packn * 4, vl); + vfloat32m1_t _k12 = __riscv_vle32_v_f32m1(k0 + packn * 5, vl); + vfloat32m1_t _k20 = __riscv_vle32_v_f32m1(k0 + packn * 6, vl); + vfloat32m1_t _k21 = __riscv_vle32_v_f32m1(k0 + packn * 7, vl); + vfloat32m1_t _k22 = __riscv_vle32_v_f32m1(k0 + packn * 8, vl); int i = 0; for (; i < outh; i++) @@ -339,98 +339,98 @@ static void conv3x3s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const int j = 0; for (; j + 7 < outw; j += 8) { - vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl); - vfloat32m1_t _sum1 = vle32_v_f32m1(outptr0 + packn, vl); - vfloat32m1_t _sum2 = vle32_v_f32m1(outptr0 + packn * 2, vl); - vfloat32m1_t _sum3 = vle32_v_f32m1(outptr0 + packn * 3, vl); - vfloat32m1_t _sum4 = vle32_v_f32m1(outptr0 + packn * 4, vl); - vfloat32m1_t _sum5 = vle32_v_f32m1(outptr0 + packn * 5, vl); - vfloat32m1_t _sum6 = vle32_v_f32m1(outptr0 + packn * 6, vl); - vfloat32m1_t _sum7 = vle32_v_f32m1(outptr0 + packn * 7, vl); - - _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[2], _k00, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[4], _k00, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[6], _k00, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r0[8], _k00, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r0[10], _k00, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r0[12], _k00, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r0[14], _k00, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[3], _k01, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[5], _k01, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[7], _k01, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r0[9], _k01, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r0[11], _k01, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r0[13], _k01, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r0[15], _k01, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[4], _k02, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[6], _k02, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[8], _k02, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r0[10], _k02, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r0[12], _k02, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r0[14], _k02, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r0[16], _k02, vl); - - _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[2], _k10, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[4], _k10, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[6], _k10, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r1[8], _k10, vl); - _sum5 = 
vfmacc_vf_f32m1(_sum5, r1[10], _k10, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r1[12], _k10, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r1[14], _k10, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[3], _k11, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[5], _k11, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[7], _k11, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r1[9], _k11, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r1[11], _k11, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r1[13], _k11, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r1[15], _k11, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[4], _k12, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[6], _k12, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[8], _k12, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r1[10], _k12, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r1[12], _k12, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r1[14], _k12, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r1[16], _k12, vl); - - _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[2], _k20, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[4], _k20, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[6], _k20, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r2[8], _k20, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r2[10], _k20, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r2[12], _k20, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r2[14], _k20, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[3], _k21, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[5], _k21, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[7], _k21, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r2[9], _k21, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r2[11], _k21, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r2[13], _k21, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r2[15], _k21, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[4], _k22, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[6], _k22, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[8], _k22, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r2[10], _k22, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r2[12], _k22, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r2[14], _k22, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r2[16], _k22, vl); - - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + packn, _sum1, vl); - vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl); - vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl); - vse32_v_f32m1(outptr0 + packn * 4, _sum4, vl); - vse32_v_f32m1(outptr0 + packn * 5, _sum5, vl); - vse32_v_f32m1(outptr0 + packn * 6, _sum6, vl); - vse32_v_f32m1(outptr0 + packn * 7, _sum7, vl); + vfloat32m1_t _sum0 = __riscv_vle32_v_f32m1(outptr0, vl); + vfloat32m1_t _sum1 = __riscv_vle32_v_f32m1(outptr0 + packn, vl); + vfloat32m1_t _sum2 = __riscv_vle32_v_f32m1(outptr0 + packn * 2, vl); + vfloat32m1_t _sum3 = __riscv_vle32_v_f32m1(outptr0 + packn * 3, vl); + vfloat32m1_t _sum4 = __riscv_vle32_v_f32m1(outptr0 + packn * 4, vl); + vfloat32m1_t _sum5 = __riscv_vle32_v_f32m1(outptr0 + packn * 5, vl); + vfloat32m1_t _sum6 = __riscv_vle32_v_f32m1(outptr0 + packn * 6, vl); + vfloat32m1_t _sum7 = __riscv_vle32_v_f32m1(outptr0 + packn * 7, vl); + + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[2], _k00, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[4], _k00, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[6], _k00, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r0[8], _k00, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r0[10], _k00, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r0[12], _k00, vl); + 
_sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r0[14], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[3], _k01, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[5], _k01, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[7], _k01, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r0[9], _k01, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r0[11], _k01, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r0[13], _k01, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r0[15], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[4], _k02, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[6], _k02, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[8], _k02, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r0[10], _k02, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r0[12], _k02, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r0[14], _k02, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r0[16], _k02, vl); + + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[2], _k10, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[4], _k10, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[6], _k10, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r1[8], _k10, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r1[10], _k10, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r1[12], _k10, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r1[14], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[3], _k11, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[5], _k11, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[7], _k11, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r1[9], _k11, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r1[11], _k11, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r1[13], _k11, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r1[15], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[4], _k12, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[6], _k12, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[8], _k12, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r1[10], _k12, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r1[12], _k12, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r1[14], _k12, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r1[16], _k12, vl); + + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[2], _k20, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[4], _k20, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[6], _k20, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r2[8], _k20, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r2[10], _k20, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r2[12], _k20, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r2[14], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[3], _k21, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[5], _k21, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[7], _k21, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r2[9], _k21, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r2[11], _k21, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r2[13], _k21, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r2[15], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[4], _k22, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[6], 
_k22, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[8], _k22, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r2[10], _k22, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r2[12], _k22, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r2[14], _k22, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r2[16], _k22, vl); + + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0 + packn, _sum1, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 4, _sum4, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 5, _sum5, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 6, _sum6, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 7, _sum7, vl); outptr0 += packn * 8; @@ -440,54 +440,54 @@ static void conv3x3s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const } for (; j + 3 < outw; j += 4) { - vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl); - vfloat32m1_t _sum1 = vle32_v_f32m1(outptr0 + packn, vl); - vfloat32m1_t _sum2 = vle32_v_f32m1(outptr0 + packn * 2, vl); - vfloat32m1_t _sum3 = vle32_v_f32m1(outptr0 + packn * 3, vl); - - _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[2], _k00, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[4], _k00, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[6], _k00, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[3], _k01, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[5], _k01, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[7], _k01, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[4], _k02, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[6], _k02, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[8], _k02, vl); - - _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[2], _k10, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[4], _k10, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[6], _k10, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[3], _k11, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[5], _k11, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[7], _k11, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[4], _k12, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[6], _k12, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[8], _k12, vl); - - _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[2], _k20, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[4], _k20, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[6], _k20, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[3], _k21, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[5], _k21, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[7], _k21, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[4], _k22, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[6], _k22, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[8], _k22, vl); - - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + packn, _sum1, vl); - vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl); - vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl); + vfloat32m1_t _sum0 = __riscv_vle32_v_f32m1(outptr0, vl); + vfloat32m1_t _sum1 = __riscv_vle32_v_f32m1(outptr0 + packn, vl); + vfloat32m1_t _sum2 = __riscv_vle32_v_f32m1(outptr0 + packn * 2, vl); + vfloat32m1_t _sum3 = __riscv_vle32_v_f32m1(outptr0 + packn * 3, vl); + + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[2], _k00, 
vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[4], _k00, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[6], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[3], _k01, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[5], _k01, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[7], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[4], _k02, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[6], _k02, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[8], _k02, vl); + + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[2], _k10, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[4], _k10, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[6], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[3], _k11, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[5], _k11, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[7], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[4], _k12, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[6], _k12, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[8], _k12, vl); + + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[2], _k20, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[4], _k20, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[6], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[3], _k21, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[5], _k21, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[7], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[4], _k22, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[6], _k22, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[8], _k22, vl); + + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0 + packn, _sum1, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl); outptr0 += packn * 4; @@ -497,32 +497,32 @@ static void conv3x3s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const } for (; j + 1 < outw; j += 2) { - vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl); - vfloat32m1_t _sum1 = vle32_v_f32m1(outptr0 + packn, vl); - - _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[2], _k00, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[3], _k01, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[4], _k02, vl); - - _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[2], _k10, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[3], _k11, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[4], _k12, vl); - - _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[2], _k20, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[3], _k21, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[4], _k22, vl); - - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + packn, _sum1, vl); + vfloat32m1_t _sum0 = 
__riscv_vle32_v_f32m1(outptr0, vl); + vfloat32m1_t _sum1 = __riscv_vle32_v_f32m1(outptr0 + packn, vl); + + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[2], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[3], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[4], _k02, vl); + + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[2], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[3], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[4], _k12, vl); + + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[2], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[3], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[4], _k22, vl); + + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0 + packn, _sum1, vl); outptr0 += packn * 2; @@ -532,21 +532,21 @@ static void conv3x3s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const } for (; j < outw; j++) { - vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl); + vfloat32m1_t _sum0 = __riscv_vle32_v_f32m1(outptr0, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); - vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); outptr0 += packn; diff --git a/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h b/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h index bff24a0099f..100751d6d30 100644 --- a/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void conv3x3s1_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int inch = bottom_blob.c; int outw = top_blob.w; @@ -29,7 +29,7 @@ static void conv3x3s1_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, { Mat out0 = top_blob.channel(p); - vfloat16m1_t _bias0 = bias ? vle16_v_f16m1(bias + p * packn, vl) : vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _bias0 = bias ? 
__riscv_vle16_v_f16m1(bias + p * packn, vl) : __riscv_vfmv_v_f_f16m1(0.f, vl); out0.fill(_bias0); const __fp16* k0 = kernel.channel(p); @@ -45,15 +45,15 @@ static void conv3x3s1_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const __fp16* r1 = img0.row(1); const __fp16* r2 = img0.row(2); - vfloat16m1_t _k00 = vle16_v_f16m1(k0, vl); - vfloat16m1_t _k01 = vle16_v_f16m1(k0 + packn, vl); - vfloat16m1_t _k02 = vle16_v_f16m1(k0 + packn * 2, vl); - vfloat16m1_t _k10 = vle16_v_f16m1(k0 + packn * 3, vl); - vfloat16m1_t _k11 = vle16_v_f16m1(k0 + packn * 4, vl); - vfloat16m1_t _k12 = vle16_v_f16m1(k0 + packn * 5, vl); - vfloat16m1_t _k20 = vle16_v_f16m1(k0 + packn * 6, vl); - vfloat16m1_t _k21 = vle16_v_f16m1(k0 + packn * 7, vl); - vfloat16m1_t _k22 = vle16_v_f16m1(k0 + packn * 8, vl); + vfloat16m1_t _k00 = __riscv_vle16_v_f16m1(k0, vl); + vfloat16m1_t _k01 = __riscv_vle16_v_f16m1(k0 + packn, vl); + vfloat16m1_t _k02 = __riscv_vle16_v_f16m1(k0 + packn * 2, vl); + vfloat16m1_t _k10 = __riscv_vle16_v_f16m1(k0 + packn * 3, vl); + vfloat16m1_t _k11 = __riscv_vle16_v_f16m1(k0 + packn * 4, vl); + vfloat16m1_t _k12 = __riscv_vle16_v_f16m1(k0 + packn * 5, vl); + vfloat16m1_t _k20 = __riscv_vle16_v_f16m1(k0 + packn * 6, vl); + vfloat16m1_t _k21 = __riscv_vle16_v_f16m1(k0 + packn * 7, vl); + vfloat16m1_t _k22 = __riscv_vle16_v_f16m1(k0 + packn * 8, vl); int i = 0; for (; i < outh; i++) @@ -61,98 +61,98 @@ static void conv3x3s1_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, int j = 0; for (; j + 7 < outw; j += 8) { - vfloat16m1_t _sum0 = vle16_v_f16m1(outptr0, vl); - vfloat16m1_t _sum1 = vle16_v_f16m1(outptr0 + packn, vl); - vfloat16m1_t _sum2 = vle16_v_f16m1(outptr0 + packn * 2, vl); - vfloat16m1_t _sum3 = vle16_v_f16m1(outptr0 + packn * 3, vl); - vfloat16m1_t _sum4 = vle16_v_f16m1(outptr0 + packn * 4, vl); - vfloat16m1_t _sum5 = vle16_v_f16m1(outptr0 + packn * 5, vl); - vfloat16m1_t _sum6 = vle16_v_f16m1(outptr0 + packn * 6, vl); - vfloat16m1_t _sum7 = vle16_v_f16m1(outptr0 + packn * 7, vl); - - _sum0 = vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[1], _k00, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[2], _k00, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[3], _k00, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r0[4], _k00, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r0[5], _k00, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r0[6], _k00, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r0[7], _k00, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[2], _k01, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[3], _k01, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[4], _k01, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r0[5], _k01, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r0[6], _k01, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r0[7], _k01, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r0[8], _k01, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[3], _k02, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[4], _k02, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[5], _k02, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r0[6], _k02, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r0[7], _k02, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r0[8], _k02, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r0[9], _k02, vl); - - _sum0 = vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[1], _k10, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[2], _k10, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[3], _k10, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r1[4], _k10, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r1[5], 
_k10, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r1[6], _k10, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r1[7], _k10, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[2], _k11, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[3], _k11, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[4], _k11, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r1[5], _k11, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r1[6], _k11, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r1[7], _k11, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r1[8], _k11, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[3], _k12, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[4], _k12, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[5], _k12, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r1[6], _k12, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r1[7], _k12, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r1[8], _k12, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r1[9], _k12, vl); - - _sum0 = vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[1], _k20, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[2], _k20, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[3], _k20, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r2[4], _k20, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r2[5], _k20, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r2[6], _k20, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r2[7], _k20, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[2], _k21, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[3], _k21, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[4], _k21, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r2[5], _k21, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r2[6], _k21, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r2[7], _k21, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r2[8], _k21, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[3], _k22, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[4], _k22, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[5], _k22, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r2[6], _k22, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r2[7], _k22, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r2[8], _k22, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r2[9], _k22, vl); - - vse16_v_f16m1(outptr0, _sum0, vl); - vse16_v_f16m1(outptr0 + packn, _sum1, vl); - vse16_v_f16m1(outptr0 + packn * 2, _sum2, vl); - vse16_v_f16m1(outptr0 + packn * 3, _sum3, vl); - vse16_v_f16m1(outptr0 + packn * 4, _sum4, vl); - vse16_v_f16m1(outptr0 + packn * 5, _sum5, vl); - vse16_v_f16m1(outptr0 + packn * 6, _sum6, vl); - vse16_v_f16m1(outptr0 + packn * 7, _sum7, vl); + vfloat16m1_t _sum0 = __riscv_vle16_v_f16m1(outptr0, vl); + vfloat16m1_t _sum1 = __riscv_vle16_v_f16m1(outptr0 + packn, vl); + vfloat16m1_t _sum2 = __riscv_vle16_v_f16m1(outptr0 + packn * 2, vl); + vfloat16m1_t _sum3 = __riscv_vle16_v_f16m1(outptr0 + packn * 3, vl); + vfloat16m1_t _sum4 = __riscv_vle16_v_f16m1(outptr0 + packn * 4, vl); + vfloat16m1_t _sum5 = __riscv_vle16_v_f16m1(outptr0 + packn * 5, vl); + vfloat16m1_t _sum6 = __riscv_vle16_v_f16m1(outptr0 + packn * 6, vl); + vfloat16m1_t _sum7 = __riscv_vle16_v_f16m1(outptr0 + packn * 7, vl); + + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[1], _k00, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[2], _k00, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[3], _k00, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r0[4], _k00, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r0[5], _k00, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r0[6], _k00, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r0[7], _k00, 
vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[2], _k01, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[3], _k01, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[4], _k01, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r0[5], _k01, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r0[6], _k01, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r0[7], _k01, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r0[8], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[3], _k02, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[4], _k02, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[5], _k02, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r0[6], _k02, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r0[7], _k02, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r0[8], _k02, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r0[9], _k02, vl); + + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[1], _k10, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[2], _k10, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[3], _k10, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r1[4], _k10, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r1[5], _k10, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r1[6], _k10, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r1[7], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[2], _k11, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[3], _k11, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[4], _k11, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r1[5], _k11, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r1[6], _k11, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r1[7], _k11, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r1[8], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[3], _k12, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[4], _k12, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[5], _k12, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r1[6], _k12, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r1[7], _k12, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r1[8], _k12, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r1[9], _k12, vl); + + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[1], _k20, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[2], _k20, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[3], _k20, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r2[4], _k20, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r2[5], _k20, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r2[6], _k20, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r2[7], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[2], _k21, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[3], _k21, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[4], _k21, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r2[5], _k21, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r2[6], _k21, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r2[7], _k21, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r2[8], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[3], _k22, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[4], _k22, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[5], _k22, vl); + 
_sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r2[6], _k22, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r2[7], _k22, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r2[8], _k22, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r2[9], _k22, vl); + + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr0 + packn, _sum1, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 2, _sum2, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 3, _sum3, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 4, _sum4, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 5, _sum5, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 6, _sum6, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 7, _sum7, vl); outptr0 += packn * 8; @@ -162,54 +162,54 @@ static void conv3x3s1_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, } for (; j + 3 < outw; j += 4) { - vfloat16m1_t _sum0 = vle16_v_f16m1(outptr0, vl); - vfloat16m1_t _sum1 = vle16_v_f16m1(outptr0 + packn, vl); - vfloat16m1_t _sum2 = vle16_v_f16m1(outptr0 + packn * 2, vl); - vfloat16m1_t _sum3 = vle16_v_f16m1(outptr0 + packn * 3, vl); - - _sum0 = vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[1], _k00, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[2], _k00, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[3], _k00, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[2], _k01, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[3], _k01, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[4], _k01, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[3], _k02, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[4], _k02, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[5], _k02, vl); - - _sum0 = vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[1], _k10, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[2], _k10, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[3], _k10, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[2], _k11, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[3], _k11, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[4], _k11, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[3], _k12, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[4], _k12, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[5], _k12, vl); - - _sum0 = vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[1], _k20, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[2], _k20, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[3], _k20, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[2], _k21, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[3], _k21, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[4], _k21, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[3], _k22, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[4], _k22, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[5], _k22, vl); - - vse16_v_f16m1(outptr0, _sum0, vl); - vse16_v_f16m1(outptr0 + packn, _sum1, vl); - vse16_v_f16m1(outptr0 + packn * 2, _sum2, vl); - vse16_v_f16m1(outptr0 + packn * 3, _sum3, vl); + vfloat16m1_t _sum0 = __riscv_vle16_v_f16m1(outptr0, vl); + vfloat16m1_t _sum1 = __riscv_vle16_v_f16m1(outptr0 + packn, vl); + vfloat16m1_t _sum2 = __riscv_vle16_v_f16m1(outptr0 + packn * 2, vl); + vfloat16m1_t _sum3 = __riscv_vle16_v_f16m1(outptr0 + packn * 3, vl); + + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[1], _k00, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[2], _k00, vl); + _sum3 = 
__riscv_vfmacc_vf_f16m1(_sum3, r0[3], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[2], _k01, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[3], _k01, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[4], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[3], _k02, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[4], _k02, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[5], _k02, vl); + + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[1], _k10, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[2], _k10, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[3], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[2], _k11, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[3], _k11, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[4], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[3], _k12, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[4], _k12, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[5], _k12, vl); + + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[1], _k20, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[2], _k20, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[3], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[2], _k21, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[3], _k21, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[4], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[3], _k22, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[4], _k22, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[5], _k22, vl); + + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr0 + packn, _sum1, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 2, _sum2, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 3, _sum3, vl); outptr0 += packn * 4; @@ -219,32 +219,32 @@ static void conv3x3s1_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, } for (; j + 1 < outw; j += 2) { - vfloat16m1_t _sum0 = vle16_v_f16m1(outptr0, vl); - vfloat16m1_t _sum1 = vle16_v_f16m1(outptr0 + packn, vl); - - _sum0 = vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[1], _k00, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[2], _k01, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[3], _k02, vl); - - _sum0 = vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[1], _k10, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[2], _k11, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[3], _k12, vl); - - _sum0 = vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[1], _k20, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[2], _k21, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[3], _k22, vl); - - vse16_v_f16m1(outptr0, _sum0, vl); - vse16_v_f16m1(outptr0 + packn, _sum1, vl); + vfloat16m1_t _sum0 = __riscv_vle16_v_f16m1(outptr0, vl); + vfloat16m1_t _sum1 = __riscv_vle16_v_f16m1(outptr0 + packn, 
vl); + + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[1], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[2], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[3], _k02, vl); + + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[1], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[2], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[3], _k12, vl); + + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[1], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[2], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[3], _k22, vl); + + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr0 + packn, _sum1, vl); outptr0 += packn * 2; @@ -254,21 +254,21 @@ static void conv3x3s1_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, } for (; j < outw; j++) { - vfloat16m1_t _sum0 = vle16_v_f16m1(outptr0, vl); + vfloat16m1_t _sum0 = __riscv_vle16_v_f16m1(outptr0, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); - vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); outptr0 += packn; @@ -290,7 +290,7 @@ static void conv3x3s1_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, static void conv3x3s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; @@ -307,7 +307,7 @@ static void conv3x3s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, { Mat out0 = top_blob.channel(p); - vfloat16m1_t _bias0 = bias ? vle16_v_f16m1(bias + p * packn, vl) : vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _bias0 = bias ? 
__riscv_vle16_v_f16m1(bias + p * packn, vl) : __riscv_vfmv_v_f_f16m1(0.f, vl); out0.fill(_bias0); const __fp16* k0 = kernel.channel(p); @@ -323,15 +323,15 @@ static void conv3x3s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const __fp16* r1 = img0.row(1); const __fp16* r2 = img0.row(2); - vfloat16m1_t _k00 = vle16_v_f16m1(k0, vl); - vfloat16m1_t _k01 = vle16_v_f16m1(k0 + packn, vl); - vfloat16m1_t _k02 = vle16_v_f16m1(k0 + packn * 2, vl); - vfloat16m1_t _k10 = vle16_v_f16m1(k0 + packn * 3, vl); - vfloat16m1_t _k11 = vle16_v_f16m1(k0 + packn * 4, vl); - vfloat16m1_t _k12 = vle16_v_f16m1(k0 + packn * 5, vl); - vfloat16m1_t _k20 = vle16_v_f16m1(k0 + packn * 6, vl); - vfloat16m1_t _k21 = vle16_v_f16m1(k0 + packn * 7, vl); - vfloat16m1_t _k22 = vle16_v_f16m1(k0 + packn * 8, vl); + vfloat16m1_t _k00 = __riscv_vle16_v_f16m1(k0, vl); + vfloat16m1_t _k01 = __riscv_vle16_v_f16m1(k0 + packn, vl); + vfloat16m1_t _k02 = __riscv_vle16_v_f16m1(k0 + packn * 2, vl); + vfloat16m1_t _k10 = __riscv_vle16_v_f16m1(k0 + packn * 3, vl); + vfloat16m1_t _k11 = __riscv_vle16_v_f16m1(k0 + packn * 4, vl); + vfloat16m1_t _k12 = __riscv_vle16_v_f16m1(k0 + packn * 5, vl); + vfloat16m1_t _k20 = __riscv_vle16_v_f16m1(k0 + packn * 6, vl); + vfloat16m1_t _k21 = __riscv_vle16_v_f16m1(k0 + packn * 7, vl); + vfloat16m1_t _k22 = __riscv_vle16_v_f16m1(k0 + packn * 8, vl); int i = 0; for (; i < outh; i++) @@ -339,98 +339,98 @@ static void conv3x3s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, int j = 0; for (; j + 7 < outw; j += 8) { - vfloat16m1_t _sum0 = vle16_v_f16m1(outptr0, vl); - vfloat16m1_t _sum1 = vle16_v_f16m1(outptr0 + packn, vl); - vfloat16m1_t _sum2 = vle16_v_f16m1(outptr0 + packn * 2, vl); - vfloat16m1_t _sum3 = vle16_v_f16m1(outptr0 + packn * 3, vl); - vfloat16m1_t _sum4 = vle16_v_f16m1(outptr0 + packn * 4, vl); - vfloat16m1_t _sum5 = vle16_v_f16m1(outptr0 + packn * 5, vl); - vfloat16m1_t _sum6 = vle16_v_f16m1(outptr0 + packn * 6, vl); - vfloat16m1_t _sum7 = vle16_v_f16m1(outptr0 + packn * 7, vl); - - _sum0 = vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[2], _k00, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[4], _k00, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[6], _k00, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r0[8], _k00, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r0[10], _k00, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r0[12], _k00, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r0[14], _k00, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[3], _k01, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[5], _k01, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[7], _k01, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r0[9], _k01, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r0[11], _k01, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r0[13], _k01, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r0[15], _k01, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[4], _k02, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[6], _k02, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[8], _k02, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r0[10], _k02, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r0[12], _k02, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r0[14], _k02, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r0[16], _k02, vl); - - _sum0 = vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[2], _k10, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[4], _k10, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[6], _k10, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r1[8], _k10, vl); - _sum5 = 
vfmacc_vf_f16m1(_sum5, r1[10], _k10, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r1[12], _k10, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r1[14], _k10, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[3], _k11, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[5], _k11, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[7], _k11, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r1[9], _k11, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r1[11], _k11, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r1[13], _k11, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r1[15], _k11, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[4], _k12, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[6], _k12, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[8], _k12, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r1[10], _k12, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r1[12], _k12, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r1[14], _k12, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r1[16], _k12, vl); - - _sum0 = vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[2], _k20, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[4], _k20, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[6], _k20, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r2[8], _k20, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r2[10], _k20, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r2[12], _k20, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r2[14], _k20, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[3], _k21, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[5], _k21, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[7], _k21, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r2[9], _k21, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r2[11], _k21, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r2[13], _k21, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r2[15], _k21, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[4], _k22, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[6], _k22, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[8], _k22, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r2[10], _k22, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r2[12], _k22, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r2[14], _k22, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r2[16], _k22, vl); - - vse16_v_f16m1(outptr0, _sum0, vl); - vse16_v_f16m1(outptr0 + packn, _sum1, vl); - vse16_v_f16m1(outptr0 + packn * 2, _sum2, vl); - vse16_v_f16m1(outptr0 + packn * 3, _sum3, vl); - vse16_v_f16m1(outptr0 + packn * 4, _sum4, vl); - vse16_v_f16m1(outptr0 + packn * 5, _sum5, vl); - vse16_v_f16m1(outptr0 + packn * 6, _sum6, vl); - vse16_v_f16m1(outptr0 + packn * 7, _sum7, vl); + vfloat16m1_t _sum0 = __riscv_vle16_v_f16m1(outptr0, vl); + vfloat16m1_t _sum1 = __riscv_vle16_v_f16m1(outptr0 + packn, vl); + vfloat16m1_t _sum2 = __riscv_vle16_v_f16m1(outptr0 + packn * 2, vl); + vfloat16m1_t _sum3 = __riscv_vle16_v_f16m1(outptr0 + packn * 3, vl); + vfloat16m1_t _sum4 = __riscv_vle16_v_f16m1(outptr0 + packn * 4, vl); + vfloat16m1_t _sum5 = __riscv_vle16_v_f16m1(outptr0 + packn * 5, vl); + vfloat16m1_t _sum6 = __riscv_vle16_v_f16m1(outptr0 + packn * 6, vl); + vfloat16m1_t _sum7 = __riscv_vle16_v_f16m1(outptr0 + packn * 7, vl); + + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[2], _k00, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[4], _k00, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[6], _k00, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r0[8], _k00, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r0[10], _k00, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r0[12], _k00, vl); + 
_sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r0[14], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[3], _k01, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[5], _k01, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[7], _k01, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r0[9], _k01, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r0[11], _k01, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r0[13], _k01, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r0[15], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[4], _k02, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[6], _k02, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[8], _k02, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r0[10], _k02, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r0[12], _k02, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r0[14], _k02, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r0[16], _k02, vl); + + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[2], _k10, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[4], _k10, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[6], _k10, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r1[8], _k10, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r1[10], _k10, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r1[12], _k10, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r1[14], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[3], _k11, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[5], _k11, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[7], _k11, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r1[9], _k11, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r1[11], _k11, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r1[13], _k11, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r1[15], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[4], _k12, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[6], _k12, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[8], _k12, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r1[10], _k12, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r1[12], _k12, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r1[14], _k12, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r1[16], _k12, vl); + + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[2], _k20, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[4], _k20, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[6], _k20, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r2[8], _k20, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r2[10], _k20, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r2[12], _k20, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r2[14], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[3], _k21, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[5], _k21, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[7], _k21, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r2[9], _k21, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r2[11], _k21, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r2[13], _k21, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r2[15], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[4], _k22, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[6], 
_k22, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[8], _k22, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r2[10], _k22, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r2[12], _k22, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r2[14], _k22, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r2[16], _k22, vl); + + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr0 + packn, _sum1, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 2, _sum2, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 3, _sum3, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 4, _sum4, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 5, _sum5, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 6, _sum6, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 7, _sum7, vl); outptr0 += packn * 8; @@ -440,54 +440,54 @@ static void conv3x3s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, } for (; j + 3 < outw; j += 4) { - vfloat16m1_t _sum0 = vle16_v_f16m1(outptr0, vl); - vfloat16m1_t _sum1 = vle16_v_f16m1(outptr0 + packn, vl); - vfloat16m1_t _sum2 = vle16_v_f16m1(outptr0 + packn * 2, vl); - vfloat16m1_t _sum3 = vle16_v_f16m1(outptr0 + packn * 3, vl); - - _sum0 = vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[2], _k00, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[4], _k00, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[6], _k00, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[3], _k01, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[5], _k01, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[7], _k01, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[4], _k02, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[6], _k02, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[8], _k02, vl); - - _sum0 = vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[2], _k10, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[4], _k10, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[6], _k10, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[3], _k11, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[5], _k11, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[7], _k11, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[4], _k12, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[6], _k12, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[8], _k12, vl); - - _sum0 = vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[2], _k20, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[4], _k20, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[6], _k20, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[3], _k21, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[5], _k21, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[7], _k21, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[4], _k22, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[6], _k22, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[8], _k22, vl); - - vse16_v_f16m1(outptr0, _sum0, vl); - vse16_v_f16m1(outptr0 + packn, _sum1, vl); - vse16_v_f16m1(outptr0 + packn * 2, _sum2, vl); - vse16_v_f16m1(outptr0 + packn * 3, _sum3, vl); + vfloat16m1_t _sum0 = __riscv_vle16_v_f16m1(outptr0, vl); + vfloat16m1_t _sum1 = __riscv_vle16_v_f16m1(outptr0 + packn, vl); + vfloat16m1_t _sum2 = __riscv_vle16_v_f16m1(outptr0 + packn * 2, vl); + vfloat16m1_t _sum3 = __riscv_vle16_v_f16m1(outptr0 + packn * 3, vl); + + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[2], _k00, 
vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[4], _k00, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[6], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[3], _k01, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[5], _k01, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[7], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[4], _k02, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[6], _k02, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[8], _k02, vl); + + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[2], _k10, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[4], _k10, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[6], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[3], _k11, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[5], _k11, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[7], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[4], _k12, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[6], _k12, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[8], _k12, vl); + + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[2], _k20, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[4], _k20, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[6], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[3], _k21, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[5], _k21, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[7], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[4], _k22, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[6], _k22, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[8], _k22, vl); + + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr0 + packn, _sum1, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 2, _sum2, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 3, _sum3, vl); outptr0 += packn * 4; @@ -497,32 +497,32 @@ static void conv3x3s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, } for (; j + 1 < outw; j += 2) { - vfloat16m1_t _sum0 = vle16_v_f16m1(outptr0, vl); - vfloat16m1_t _sum1 = vle16_v_f16m1(outptr0 + packn, vl); - - _sum0 = vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[2], _k00, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[3], _k01, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[4], _k02, vl); - - _sum0 = vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[2], _k10, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[3], _k11, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[4], _k12, vl); - - _sum0 = vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[2], _k20, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[3], _k21, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[4], _k22, vl); - - vse16_v_f16m1(outptr0, _sum0, vl); - vse16_v_f16m1(outptr0 + packn, _sum1, vl); + vfloat16m1_t _sum0 = 
__riscv_vle16_v_f16m1(outptr0, vl); + vfloat16m1_t _sum1 = __riscv_vle16_v_f16m1(outptr0 + packn, vl); + + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[2], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[3], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[4], _k02, vl); + + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[2], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[3], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[4], _k12, vl); + + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[2], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[3], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[4], _k22, vl); + + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr0 + packn, _sum1, vl); outptr0 += packn * 2; @@ -532,21 +532,21 @@ static void conv3x3s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, } for (; j < outw; j++) { - vfloat16m1_t _sum0 = vle16_v_f16m1(outptr0, vl); + vfloat16m1_t _sum0 = __riscv_vle16_v_f16m1(outptr0, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); - vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); outptr0 += packn; diff --git a/src/layer/riscv/convolution_7x7_pack1ton.h b/src/layer/riscv/convolution_7x7_pack1ton.h index 3605ed027cd..709230a0fdd 100644 --- a/src/layer/riscv/convolution_7x7_pack1ton.h +++ b/src/layer/riscv/convolution_7x7_pack1ton.h @@ -15,7 +15,7 @@ static void conv7x7s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; @@ -33,7 +33,7 @@ static void conv7x7s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const { Mat out0 = top_blob.channel(p); - vfloat32m1_t _bias0 = bias ? vle32_v_f32m1(bias + p * packn, vl) : vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _bias0 = bias ? 
__riscv_vle32_v_f32m1(bias + p * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); out0.fill(_bias0); for (int q = 0; q < inch; q++) @@ -59,492 +59,492 @@ static void conv7x7s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const int j = 0; for (; j + 7 < outw; j += 8) { - vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl); - vfloat32m1_t _sum1 = vle32_v_f32m1(outptr0 + packn, vl); - vfloat32m1_t _sum2 = vle32_v_f32m1(outptr0 + packn * 2, vl); - vfloat32m1_t _sum3 = vle32_v_f32m1(outptr0 + packn * 3, vl); - vfloat32m1_t _sum4 = vle32_v_f32m1(outptr0 + packn * 4, vl); - vfloat32m1_t _sum5 = vle32_v_f32m1(outptr0 + packn * 5, vl); - vfloat32m1_t _sum6 = vle32_v_f32m1(outptr0 + packn * 6, vl); - vfloat32m1_t _sum7 = vle32_v_f32m1(outptr0 + packn * 7, vl); - - vfloat32m1_t _k00 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k01 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k02 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k03 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k04 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k05 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k06 = vle32_v_f32m1(kptr + packn * 6, vl); + vfloat32m1_t _sum0 = __riscv_vle32_v_f32m1(outptr0, vl); + vfloat32m1_t _sum1 = __riscv_vle32_v_f32m1(outptr0 + packn, vl); + vfloat32m1_t _sum2 = __riscv_vle32_v_f32m1(outptr0 + packn * 2, vl); + vfloat32m1_t _sum3 = __riscv_vle32_v_f32m1(outptr0 + packn * 3, vl); + vfloat32m1_t _sum4 = __riscv_vle32_v_f32m1(outptr0 + packn * 4, vl); + vfloat32m1_t _sum5 = __riscv_vle32_v_f32m1(outptr0 + packn * 5, vl); + vfloat32m1_t _sum6 = __riscv_vle32_v_f32m1(outptr0 + packn * 6, vl); + vfloat32m1_t _sum7 = __riscv_vle32_v_f32m1(outptr0 + packn * 7, vl); + + vfloat32m1_t _k00 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k01 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k02 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k03 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k04 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + vfloat32m1_t _k05 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k06 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[2], _k00, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[4], _k00, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[6], _k00, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r0[8], _k00, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r0[10], _k00, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r0[12], _k00, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r0[14], _k00, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[3], _k01, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[5], _k01, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[7], _k01, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r0[9], _k01, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r0[11], _k01, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r0[13], _k01, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r0[15], _k01, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[4], _k02, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[6], _k02, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[8], _k02, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r0[10], _k02, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r0[12], _k02, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r0[14], _k02, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r0[16], _k02, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[3], _k03, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[5], _k03, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, 
r0[7], _k03, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[9], _k03, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r0[11], _k03, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r0[13], _k03, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r0[15], _k03, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r0[17], _k03, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[4], _k04, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[6], _k04, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[8], _k04, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[10], _k04, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r0[12], _k04, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r0[14], _k04, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r0[16], _k04, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r0[18], _k04, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[5], _k05, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[7], _k05, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[9], _k05, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[11], _k05, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r0[13], _k05, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r0[15], _k05, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r0[17], _k05, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r0[19], _k05, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[6], _k06, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[8], _k06, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[10], _k06, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[12], _k06, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r0[14], _k06, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r0[16], _k06, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r0[18], _k06, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r0[20], _k06, vl); - - vfloat32m1_t _k10 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k11 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k12 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k13 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k14 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k15 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k16 = vle32_v_f32m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[2], _k00, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[4], _k00, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[6], _k00, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r0[8], _k00, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r0[10], _k00, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r0[12], _k00, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r0[14], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[3], _k01, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[5], _k01, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[7], _k01, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r0[9], _k01, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r0[11], _k01, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r0[13], _k01, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r0[15], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[4], _k02, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[6], _k02, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[8], _k02, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r0[10], _k02, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r0[12], _k02, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r0[14], _k02, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r0[16], _k02, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[3], _k03, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[5], _k03, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[7], _k03, vl); + _sum3 = 
__riscv_vfmacc_vf_f32m1(_sum3, r0[9], _k03, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r0[11], _k03, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r0[13], _k03, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r0[15], _k03, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r0[17], _k03, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[4], _k04, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[6], _k04, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[8], _k04, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[10], _k04, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r0[12], _k04, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r0[14], _k04, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r0[16], _k04, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r0[18], _k04, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[5], _k05, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[7], _k05, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[9], _k05, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[11], _k05, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r0[13], _k05, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r0[15], _k05, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r0[17], _k05, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r0[19], _k05, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[6], _k06, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[8], _k06, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[10], _k06, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[12], _k06, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r0[14], _k06, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r0[16], _k06, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r0[18], _k06, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r0[20], _k06, vl); + + vfloat32m1_t _k10 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k11 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k12 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k13 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k14 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + vfloat32m1_t _k15 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k16 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[2], _k10, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[4], _k10, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[6], _k10, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r1[8], _k10, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r1[10], _k10, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r1[12], _k10, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r1[14], _k10, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[3], _k11, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[5], _k11, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[7], _k11, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r1[9], _k11, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r1[11], _k11, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r1[13], _k11, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r1[15], _k11, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[4], _k12, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[6], _k12, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[8], _k12, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r1[10], _k12, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r1[12], _k12, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r1[14], _k12, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r1[16], _k12, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[3], _k13, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[5], _k13, vl); - _sum2 = 
vfmacc_vf_f32m1(_sum2, r1[7], _k13, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[9], _k13, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r1[11], _k13, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r1[13], _k13, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r1[15], _k13, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r1[17], _k13, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[4], _k14, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[6], _k14, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[8], _k14, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[10], _k14, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r1[12], _k14, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r1[14], _k14, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r1[16], _k14, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r1[18], _k14, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[5], _k15, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[7], _k15, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[9], _k15, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[11], _k15, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r1[13], _k15, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r1[15], _k15, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r1[17], _k15, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r1[19], _k15, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[6], _k16, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[8], _k16, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[10], _k16, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[12], _k16, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r1[14], _k16, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r1[16], _k16, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r1[18], _k16, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r1[20], _k16, vl); - - vfloat32m1_t _k20 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k21 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k22 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k23 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k24 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k25 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k26 = vle32_v_f32m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[2], _k10, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[4], _k10, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[6], _k10, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r1[8], _k10, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r1[10], _k10, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r1[12], _k10, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r1[14], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[3], _k11, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[5], _k11, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[7], _k11, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r1[9], _k11, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r1[11], _k11, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r1[13], _k11, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r1[15], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[4], _k12, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[6], _k12, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[8], _k12, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r1[10], _k12, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r1[12], _k12, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r1[14], _k12, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r1[16], _k12, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[3], _k13, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[5], _k13, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[7], _k13, vl); + _sum3 = 
__riscv_vfmacc_vf_f32m1(_sum3, r1[9], _k13, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r1[11], _k13, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r1[13], _k13, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r1[15], _k13, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r1[17], _k13, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[4], _k14, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[6], _k14, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[8], _k14, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[10], _k14, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r1[12], _k14, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r1[14], _k14, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r1[16], _k14, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r1[18], _k14, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[5], _k15, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[7], _k15, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[9], _k15, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[11], _k15, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r1[13], _k15, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r1[15], _k15, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r1[17], _k15, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r1[19], _k15, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[6], _k16, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[8], _k16, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[10], _k16, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[12], _k16, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r1[14], _k16, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r1[16], _k16, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r1[18], _k16, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r1[20], _k16, vl); + + vfloat32m1_t _k20 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k21 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k22 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k23 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k24 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + vfloat32m1_t _k25 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k26 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[2], _k20, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[4], _k20, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[6], _k20, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r2[8], _k20, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r2[10], _k20, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r2[12], _k20, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r2[14], _k20, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[3], _k21, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[5], _k21, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[7], _k21, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r2[9], _k21, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r2[11], _k21, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r2[13], _k21, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r2[15], _k21, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[4], _k22, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[6], _k22, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[8], _k22, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r2[10], _k22, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r2[12], _k22, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r2[14], _k22, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r2[16], _k22, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[3], _k23, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[5], _k23, vl); - _sum2 = 
vfmacc_vf_f32m1(_sum2, r2[7], _k23, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[9], _k23, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r2[11], _k23, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r2[13], _k23, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r2[15], _k23, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r2[17], _k23, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[4], _k24, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[6], _k24, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[8], _k24, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[10], _k24, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r2[12], _k24, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r2[14], _k24, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r2[16], _k24, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r2[18], _k24, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[5], _k25, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[7], _k25, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[9], _k25, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[11], _k25, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r2[13], _k25, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r2[15], _k25, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r2[17], _k25, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r2[19], _k25, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[6], _k26, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[8], _k26, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[10], _k26, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[12], _k26, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r2[14], _k26, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r2[16], _k26, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r2[18], _k26, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r2[20], _k26, vl); - - vfloat32m1_t _k30 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k31 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k32 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k33 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k34 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k35 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k36 = vle32_v_f32m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[2], _k20, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[4], _k20, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[6], _k20, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r2[8], _k20, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r2[10], _k20, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r2[12], _k20, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r2[14], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[3], _k21, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[5], _k21, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[7], _k21, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r2[9], _k21, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r2[11], _k21, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r2[13], _k21, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r2[15], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[4], _k22, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[6], _k22, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[8], _k22, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r2[10], _k22, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r2[12], _k22, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r2[14], _k22, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r2[16], _k22, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[3], _k23, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[5], _k23, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[7], _k23, vl); + _sum3 = 
__riscv_vfmacc_vf_f32m1(_sum3, r2[9], _k23, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r2[11], _k23, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r2[13], _k23, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r2[15], _k23, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r2[17], _k23, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[4], _k24, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[6], _k24, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[8], _k24, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[10], _k24, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r2[12], _k24, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r2[14], _k24, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r2[16], _k24, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r2[18], _k24, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[5], _k25, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[7], _k25, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[9], _k25, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[11], _k25, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r2[13], _k25, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r2[15], _k25, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r2[17], _k25, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r2[19], _k25, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[6], _k26, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[8], _k26, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[10], _k26, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[12], _k26, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r2[14], _k26, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r2[16], _k26, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r2[18], _k26, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r2[20], _k26, vl); + + vfloat32m1_t _k30 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k31 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k32 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k33 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k34 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + vfloat32m1_t _k35 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k36 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f32m1(_sum0, r3[0], _k30, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r3[2], _k30, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r3[4], _k30, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r3[6], _k30, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r3[8], _k30, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r3[10], _k30, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r3[12], _k30, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r3[14], _k30, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r3[1], _k31, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r3[3], _k31, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r3[5], _k31, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r3[7], _k31, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r3[9], _k31, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r3[11], _k31, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r3[13], _k31, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r3[15], _k31, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r3[2], _k32, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r3[4], _k32, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r3[6], _k32, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r3[8], _k32, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r3[10], _k32, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r3[12], _k32, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r3[14], _k32, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r3[16], _k32, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r3[3], _k33, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r3[5], _k33, vl); - _sum2 = 
vfmacc_vf_f32m1(_sum2, r3[7], _k33, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r3[9], _k33, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r3[11], _k33, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r3[13], _k33, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r3[15], _k33, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r3[17], _k33, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r3[4], _k34, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r3[6], _k34, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r3[8], _k34, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r3[10], _k34, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r3[12], _k34, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r3[14], _k34, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r3[16], _k34, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r3[18], _k34, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r3[5], _k35, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r3[7], _k35, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r3[9], _k35, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r3[11], _k35, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r3[13], _k35, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r3[15], _k35, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r3[17], _k35, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r3[19], _k35, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r3[6], _k36, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r3[8], _k36, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r3[10], _k36, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r3[12], _k36, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r3[14], _k36, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r3[16], _k36, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r3[18], _k36, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r3[20], _k36, vl); - - vfloat32m1_t _k40 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k41 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k42 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k43 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k44 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k45 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k46 = vle32_v_f32m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[0], _k30, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r3[2], _k30, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r3[4], _k30, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r3[6], _k30, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r3[8], _k30, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r3[10], _k30, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r3[12], _k30, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r3[14], _k30, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[1], _k31, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r3[3], _k31, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r3[5], _k31, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r3[7], _k31, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r3[9], _k31, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r3[11], _k31, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r3[13], _k31, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r3[15], _k31, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[2], _k32, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r3[4], _k32, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r3[6], _k32, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r3[8], _k32, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r3[10], _k32, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r3[12], _k32, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r3[14], _k32, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r3[16], _k32, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[3], _k33, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r3[5], _k33, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r3[7], _k33, vl); + _sum3 = 
__riscv_vfmacc_vf_f32m1(_sum3, r3[9], _k33, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r3[11], _k33, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r3[13], _k33, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r3[15], _k33, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r3[17], _k33, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[4], _k34, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r3[6], _k34, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r3[8], _k34, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r3[10], _k34, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r3[12], _k34, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r3[14], _k34, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r3[16], _k34, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r3[18], _k34, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[5], _k35, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r3[7], _k35, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r3[9], _k35, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r3[11], _k35, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r3[13], _k35, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r3[15], _k35, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r3[17], _k35, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r3[19], _k35, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[6], _k36, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r3[8], _k36, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r3[10], _k36, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r3[12], _k36, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r3[14], _k36, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r3[16], _k36, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r3[18], _k36, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r3[20], _k36, vl); + + vfloat32m1_t _k40 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k41 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k42 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k43 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k44 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + vfloat32m1_t _k45 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k46 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f32m1(_sum0, r4[0], _k40, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r4[2], _k40, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r4[4], _k40, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r4[6], _k40, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r4[8], _k40, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r4[10], _k40, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r4[12], _k40, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r4[14], _k40, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r4[1], _k41, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r4[3], _k41, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r4[5], _k41, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r4[7], _k41, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r4[9], _k41, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r4[11], _k41, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r4[13], _k41, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r4[15], _k41, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r4[2], _k42, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r4[4], _k42, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r4[6], _k42, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r4[8], _k42, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r4[10], _k42, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r4[12], _k42, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r4[14], _k42, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r4[16], _k42, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r4[3], _k43, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r4[5], _k43, vl); - _sum2 = 
vfmacc_vf_f32m1(_sum2, r4[7], _k43, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r4[9], _k43, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r4[11], _k43, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r4[13], _k43, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r4[15], _k43, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r4[17], _k43, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r4[4], _k44, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r4[6], _k44, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r4[8], _k44, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r4[10], _k44, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r4[12], _k44, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r4[14], _k44, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r4[16], _k44, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r4[18], _k44, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r4[5], _k45, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r4[7], _k45, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r4[9], _k45, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r4[11], _k45, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r4[13], _k45, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r4[15], _k45, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r4[17], _k45, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r4[19], _k45, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r4[6], _k46, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r4[8], _k46, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r4[10], _k46, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r4[12], _k46, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r4[14], _k46, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r4[16], _k46, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r4[18], _k46, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r4[20], _k46, vl); - - vfloat32m1_t _k50 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k51 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k52 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k53 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k54 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k55 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k56 = vle32_v_f32m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[0], _k40, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r4[2], _k40, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r4[4], _k40, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r4[6], _k40, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r4[8], _k40, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r4[10], _k40, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r4[12], _k40, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r4[14], _k40, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[1], _k41, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r4[3], _k41, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r4[5], _k41, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r4[7], _k41, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r4[9], _k41, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r4[11], _k41, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r4[13], _k41, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r4[15], _k41, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[2], _k42, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r4[4], _k42, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r4[6], _k42, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r4[8], _k42, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r4[10], _k42, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r4[12], _k42, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r4[14], _k42, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r4[16], _k42, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[3], _k43, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r4[5], _k43, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r4[7], _k43, vl); + _sum3 = 
__riscv_vfmacc_vf_f32m1(_sum3, r4[9], _k43, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r4[11], _k43, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r4[13], _k43, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r4[15], _k43, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r4[17], _k43, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[4], _k44, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r4[6], _k44, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r4[8], _k44, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r4[10], _k44, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r4[12], _k44, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r4[14], _k44, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r4[16], _k44, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r4[18], _k44, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[5], _k45, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r4[7], _k45, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r4[9], _k45, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r4[11], _k45, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r4[13], _k45, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r4[15], _k45, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r4[17], _k45, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r4[19], _k45, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[6], _k46, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r4[8], _k46, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r4[10], _k46, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r4[12], _k46, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r4[14], _k46, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r4[16], _k46, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r4[18], _k46, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r4[20], _k46, vl); + + vfloat32m1_t _k50 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k51 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k52 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k53 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k54 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + vfloat32m1_t _k55 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k56 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f32m1(_sum0, r5[0], _k50, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r5[2], _k50, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r5[4], _k50, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r5[6], _k50, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r5[8], _k50, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r5[10], _k50, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r5[12], _k50, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r5[14], _k50, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r5[1], _k51, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r5[3], _k51, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r5[5], _k51, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r5[7], _k51, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r5[9], _k51, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r5[11], _k51, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r5[13], _k51, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r5[15], _k51, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r5[2], _k52, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r5[4], _k52, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r5[6], _k52, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r5[8], _k52, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r5[10], _k52, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r5[12], _k52, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r5[14], _k52, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r5[16], _k52, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r5[3], _k53, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r5[5], _k53, vl); - _sum2 = 
vfmacc_vf_f32m1(_sum2, r5[7], _k53, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r5[9], _k53, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r5[11], _k53, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r5[13], _k53, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r5[15], _k53, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r5[17], _k53, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r5[4], _k54, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r5[6], _k54, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r5[8], _k54, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r5[10], _k54, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r5[12], _k54, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r5[14], _k54, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r5[16], _k54, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r5[18], _k54, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r5[5], _k55, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r5[7], _k55, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r5[9], _k55, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r5[11], _k55, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r5[13], _k55, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r5[15], _k55, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r5[17], _k55, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r5[19], _k55, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r5[6], _k56, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r5[8], _k56, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r5[10], _k56, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r5[12], _k56, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r5[14], _k56, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r5[16], _k56, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r5[18], _k56, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r5[20], _k56, vl); - - vfloat32m1_t _k60 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k61 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k62 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k63 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k64 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k65 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k66 = vle32_v_f32m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[0], _k50, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r5[2], _k50, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r5[4], _k50, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r5[6], _k50, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r5[8], _k50, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r5[10], _k50, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r5[12], _k50, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r5[14], _k50, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[1], _k51, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r5[3], _k51, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r5[5], _k51, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r5[7], _k51, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r5[9], _k51, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r5[11], _k51, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r5[13], _k51, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r5[15], _k51, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[2], _k52, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r5[4], _k52, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r5[6], _k52, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r5[8], _k52, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r5[10], _k52, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r5[12], _k52, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r5[14], _k52, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r5[16], _k52, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[3], _k53, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r5[5], _k53, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r5[7], _k53, vl); + _sum3 = 
__riscv_vfmacc_vf_f32m1(_sum3, r5[9], _k53, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r5[11], _k53, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r5[13], _k53, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r5[15], _k53, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r5[17], _k53, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[4], _k54, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r5[6], _k54, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r5[8], _k54, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r5[10], _k54, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r5[12], _k54, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r5[14], _k54, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r5[16], _k54, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r5[18], _k54, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[5], _k55, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r5[7], _k55, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r5[9], _k55, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r5[11], _k55, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r5[13], _k55, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r5[15], _k55, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r5[17], _k55, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r5[19], _k55, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[6], _k56, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r5[8], _k56, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r5[10], _k56, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r5[12], _k56, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r5[14], _k56, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r5[16], _k56, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r5[18], _k56, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r5[20], _k56, vl); + + vfloat32m1_t _k60 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k61 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k62 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k63 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k64 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + vfloat32m1_t _k65 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k66 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr -= packn * 42; - _sum0 = vfmacc_vf_f32m1(_sum0, r6[0], _k60, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r6[2], _k60, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r6[4], _k60, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r6[6], _k60, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r6[8], _k60, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r6[10], _k60, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r6[12], _k60, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r6[14], _k60, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r6[1], _k61, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r6[3], _k61, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r6[5], _k61, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r6[7], _k61, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r6[9], _k61, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r6[11], _k61, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r6[13], _k61, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r6[15], _k61, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r6[2], _k62, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r6[4], _k62, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r6[6], _k62, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r6[8], _k62, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r6[10], _k62, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r6[12], _k62, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r6[14], _k62, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r6[16], _k62, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r6[3], _k63, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r6[5], _k63, vl); - _sum2 = 
vfmacc_vf_f32m1(_sum2, r6[7], _k63, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r6[9], _k63, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r6[11], _k63, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r6[13], _k63, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r6[15], _k63, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r6[17], _k63, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r6[4], _k64, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r6[6], _k64, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r6[8], _k64, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r6[10], _k64, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r6[12], _k64, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r6[14], _k64, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r6[16], _k64, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r6[18], _k64, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r6[5], _k65, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r6[7], _k65, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r6[9], _k65, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r6[11], _k65, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r6[13], _k65, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r6[15], _k65, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r6[17], _k65, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r6[19], _k65, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r6[6], _k66, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r6[8], _k66, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r6[10], _k66, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r6[12], _k66, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, r6[14], _k66, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, r6[16], _k66, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, r6[18], _k66, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, r6[20], _k66, vl); - - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + packn, _sum1, vl); - vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl); - vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl); - vse32_v_f32m1(outptr0 + packn * 4, _sum4, vl); - vse32_v_f32m1(outptr0 + packn * 5, _sum5, vl); - vse32_v_f32m1(outptr0 + packn * 6, _sum6, vl); - vse32_v_f32m1(outptr0 + packn * 7, _sum7, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[0], _k60, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r6[2], _k60, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r6[4], _k60, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r6[6], _k60, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r6[8], _k60, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r6[10], _k60, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r6[12], _k60, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r6[14], _k60, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[1], _k61, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r6[3], _k61, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r6[5], _k61, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r6[7], _k61, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r6[9], _k61, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r6[11], _k61, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r6[13], _k61, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r6[15], _k61, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[2], _k62, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r6[4], _k62, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r6[6], _k62, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r6[8], _k62, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r6[10], _k62, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r6[12], _k62, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r6[14], _k62, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r6[16], _k62, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[3], _k63, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r6[5], _k63, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r6[7], _k63, vl); + _sum3 = 
__riscv_vfmacc_vf_f32m1(_sum3, r6[9], _k63, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r6[11], _k63, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r6[13], _k63, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r6[15], _k63, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r6[17], _k63, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[4], _k64, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r6[6], _k64, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r6[8], _k64, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r6[10], _k64, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r6[12], _k64, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r6[14], _k64, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r6[16], _k64, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r6[18], _k64, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[5], _k65, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r6[7], _k65, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r6[9], _k65, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r6[11], _k65, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r6[13], _k65, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r6[15], _k65, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r6[17], _k65, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r6[19], _k65, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[6], _k66, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r6[8], _k66, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r6[10], _k66, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r6[12], _k66, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, r6[14], _k66, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, r6[16], _k66, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, r6[18], _k66, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, r6[20], _k66, vl); + + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0 + packn, _sum1, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 4, _sum4, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 5, _sum5, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 6, _sum6, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 7, _sum7, vl); outptr0 += packn * 8; @@ -558,288 +558,288 @@ static void conv7x7s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const } for (; j + 3 < outw; j += 4) { - vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl); - vfloat32m1_t _sum1 = vle32_v_f32m1(outptr0 + packn, vl); - vfloat32m1_t _sum2 = vle32_v_f32m1(outptr0 + packn * 2, vl); - vfloat32m1_t _sum3 = vle32_v_f32m1(outptr0 + packn * 3, vl); - - vfloat32m1_t _k00 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k01 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k02 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k03 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k04 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k05 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k06 = vle32_v_f32m1(kptr + packn * 6, vl); + vfloat32m1_t _sum0 = __riscv_vle32_v_f32m1(outptr0, vl); + vfloat32m1_t _sum1 = __riscv_vle32_v_f32m1(outptr0 + packn, vl); + vfloat32m1_t _sum2 = __riscv_vle32_v_f32m1(outptr0 + packn * 2, vl); + vfloat32m1_t _sum3 = __riscv_vle32_v_f32m1(outptr0 + packn * 3, vl); + + vfloat32m1_t _k00 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k01 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k02 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k03 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k04 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + 
vfloat32m1_t _k05 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k06 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[2], _k00, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[4], _k00, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[6], _k00, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[3], _k01, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[5], _k01, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[7], _k01, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[4], _k02, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[6], _k02, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[8], _k02, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[3], _k03, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[5], _k03, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[7], _k03, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[9], _k03, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[4], _k04, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[6], _k04, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[8], _k04, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[10], _k04, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[5], _k05, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[7], _k05, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[9], _k05, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[11], _k05, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[6], _k06, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r0[8], _k06, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r0[10], _k06, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r0[12], _k06, vl); - - vfloat32m1_t _k10 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k11 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k12 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k13 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k14 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k15 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k16 = vle32_v_f32m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[2], _k00, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[4], _k00, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[6], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[3], _k01, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[5], _k01, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[7], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[4], _k02, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[6], _k02, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[8], _k02, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[3], _k03, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[5], _k03, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[7], _k03, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[9], _k03, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[4], _k04, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[6], _k04, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[8], _k04, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[10], _k04, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[5], _k05, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[7], _k05, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r0[9], _k05, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[11], _k05, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[6], _k06, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r0[8], _k06, vl); + _sum2 = 
__riscv_vfmacc_vf_f32m1(_sum2, r0[10], _k06, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r0[12], _k06, vl); + + vfloat32m1_t _k10 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k11 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k12 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k13 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k14 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + vfloat32m1_t _k15 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k16 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[2], _k10, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[4], _k10, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[6], _k10, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[3], _k11, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[5], _k11, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[7], _k11, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[4], _k12, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[6], _k12, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[8], _k12, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[3], _k13, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[5], _k13, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[7], _k13, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[9], _k13, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[4], _k14, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[6], _k14, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[8], _k14, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[10], _k14, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[5], _k15, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[7], _k15, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[9], _k15, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[11], _k15, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[6], _k16, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r1[8], _k16, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r1[10], _k16, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r1[12], _k16, vl); - - vfloat32m1_t _k20 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k21 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k22 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k23 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k24 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k25 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k26 = vle32_v_f32m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[2], _k10, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[4], _k10, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[6], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[3], _k11, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[5], _k11, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[7], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[4], _k12, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[6], _k12, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[8], _k12, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[3], _k13, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[5], _k13, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[7], _k13, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[9], _k13, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[4], _k14, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[6], _k14, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[8], _k14, vl); + _sum3 = 
__riscv_vfmacc_vf_f32m1(_sum3, r1[10], _k14, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[5], _k15, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[7], _k15, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[9], _k15, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[11], _k15, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[6], _k16, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r1[8], _k16, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r1[10], _k16, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r1[12], _k16, vl); + + vfloat32m1_t _k20 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k21 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k22 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k23 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k24 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + vfloat32m1_t _k25 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k26 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[2], _k20, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[4], _k20, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[6], _k20, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[3], _k21, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[5], _k21, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[7], _k21, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[4], _k22, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[6], _k22, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[8], _k22, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[3], _k23, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[5], _k23, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[7], _k23, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[9], _k23, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[4], _k24, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[6], _k24, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[8], _k24, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[10], _k24, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[5], _k25, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[7], _k25, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[9], _k25, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[11], _k25, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[6], _k26, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r2[8], _k26, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r2[10], _k26, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r2[12], _k26, vl); - - vfloat32m1_t _k30 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k31 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k32 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k33 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k34 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k35 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k36 = vle32_v_f32m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[2], _k20, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[4], _k20, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[6], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[3], _k21, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[5], _k21, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[7], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[4], _k22, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[6], _k22, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[8], _k22, vl); + _sum0 = 
__riscv_vfmacc_vf_f32m1(_sum0, r2[3], _k23, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[5], _k23, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[7], _k23, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[9], _k23, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[4], _k24, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[6], _k24, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[8], _k24, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[10], _k24, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[5], _k25, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[7], _k25, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[9], _k25, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[11], _k25, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[6], _k26, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r2[8], _k26, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r2[10], _k26, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r2[12], _k26, vl); + + vfloat32m1_t _k30 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k31 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k32 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k33 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k34 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + vfloat32m1_t _k35 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k36 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f32m1(_sum0, r3[0], _k30, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r3[2], _k30, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r3[4], _k30, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r3[6], _k30, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r3[1], _k31, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r3[3], _k31, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r3[5], _k31, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r3[7], _k31, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r3[2], _k32, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r3[4], _k32, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r3[6], _k32, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r3[8], _k32, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r3[3], _k33, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r3[5], _k33, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r3[7], _k33, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r3[9], _k33, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r3[4], _k34, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r3[6], _k34, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r3[8], _k34, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r3[10], _k34, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r3[5], _k35, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r3[7], _k35, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r3[9], _k35, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r3[11], _k35, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r3[6], _k36, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r3[8], _k36, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r3[10], _k36, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r3[12], _k36, vl); - - vfloat32m1_t _k40 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k41 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k42 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k43 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k44 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k45 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k46 = vle32_v_f32m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[0], _k30, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r3[2], _k30, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r3[4], _k30, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r3[6], _k30, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[1], _k31, vl); + _sum1 = 
__riscv_vfmacc_vf_f32m1(_sum1, r3[3], _k31, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r3[5], _k31, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r3[7], _k31, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[2], _k32, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r3[4], _k32, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r3[6], _k32, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r3[8], _k32, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[3], _k33, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r3[5], _k33, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r3[7], _k33, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r3[9], _k33, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[4], _k34, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r3[6], _k34, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r3[8], _k34, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r3[10], _k34, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[5], _k35, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r3[7], _k35, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r3[9], _k35, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r3[11], _k35, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[6], _k36, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r3[8], _k36, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r3[10], _k36, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r3[12], _k36, vl); + + vfloat32m1_t _k40 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k41 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k42 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k43 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k44 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + vfloat32m1_t _k45 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k46 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f32m1(_sum0, r4[0], _k40, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r4[2], _k40, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r4[4], _k40, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r4[6], _k40, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r4[1], _k41, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r4[3], _k41, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r4[5], _k41, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r4[7], _k41, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r4[2], _k42, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r4[4], _k42, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r4[6], _k42, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r4[8], _k42, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r4[3], _k43, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r4[5], _k43, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r4[7], _k43, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r4[9], _k43, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r4[4], _k44, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r4[6], _k44, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r4[8], _k44, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r4[10], _k44, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r4[5], _k45, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r4[7], _k45, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r4[9], _k45, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r4[11], _k45, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r4[6], _k46, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r4[8], _k46, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r4[10], _k46, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r4[12], _k46, vl); - - vfloat32m1_t _k50 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k51 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k52 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k53 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k54 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t 
_k55 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k56 = vle32_v_f32m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[0], _k40, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r4[2], _k40, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r4[4], _k40, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r4[6], _k40, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[1], _k41, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r4[3], _k41, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r4[5], _k41, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r4[7], _k41, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[2], _k42, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r4[4], _k42, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r4[6], _k42, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r4[8], _k42, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[3], _k43, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r4[5], _k43, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r4[7], _k43, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r4[9], _k43, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[4], _k44, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r4[6], _k44, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r4[8], _k44, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r4[10], _k44, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[5], _k45, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r4[7], _k45, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r4[9], _k45, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r4[11], _k45, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[6], _k46, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r4[8], _k46, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r4[10], _k46, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r4[12], _k46, vl); + + vfloat32m1_t _k50 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k51 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k52 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k53 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k54 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + vfloat32m1_t _k55 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k56 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f32m1(_sum0, r5[0], _k50, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r5[2], _k50, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r5[4], _k50, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r5[6], _k50, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r5[1], _k51, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r5[3], _k51, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r5[5], _k51, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r5[7], _k51, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r5[2], _k52, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r5[4], _k52, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r5[6], _k52, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r5[8], _k52, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r5[3], _k53, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r5[5], _k53, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r5[7], _k53, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r5[9], _k53, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r5[4], _k54, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r5[6], _k54, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r5[8], _k54, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r5[10], _k54, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r5[5], _k55, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r5[7], _k55, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r5[9], _k55, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r5[11], _k55, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r5[6], _k56, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r5[8], 
_k56, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r5[10], _k56, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r5[12], _k56, vl); - - vfloat32m1_t _k60 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k61 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k62 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k63 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k64 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k65 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k66 = vle32_v_f32m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[0], _k50, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r5[2], _k50, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r5[4], _k50, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r5[6], _k50, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[1], _k51, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r5[3], _k51, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r5[5], _k51, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r5[7], _k51, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[2], _k52, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r5[4], _k52, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r5[6], _k52, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r5[8], _k52, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[3], _k53, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r5[5], _k53, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r5[7], _k53, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r5[9], _k53, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[4], _k54, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r5[6], _k54, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r5[8], _k54, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r5[10], _k54, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[5], _k55, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r5[7], _k55, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r5[9], _k55, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r5[11], _k55, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[6], _k56, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r5[8], _k56, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r5[10], _k56, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r5[12], _k56, vl); + + vfloat32m1_t _k60 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k61 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k62 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k63 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k64 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + vfloat32m1_t _k65 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k66 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr -= packn * 42; - _sum0 = vfmacc_vf_f32m1(_sum0, r6[0], _k60, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r6[2], _k60, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r6[4], _k60, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r6[6], _k60, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r6[1], _k61, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r6[3], _k61, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r6[5], _k61, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r6[7], _k61, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r6[2], _k62, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r6[4], _k62, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r6[6], _k62, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r6[8], _k62, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r6[3], _k63, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r6[5], _k63, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r6[7], _k63, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r6[9], _k63, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r6[4], _k64, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r6[6], 
_k64, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r6[8], _k64, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r6[10], _k64, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r6[5], _k65, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r6[7], _k65, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r6[9], _k65, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r6[11], _k65, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r6[6], _k66, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, r6[8], _k66, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, r6[10], _k66, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, r6[12], _k66, vl); - - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + packn, _sum1, vl); - vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl); - vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[0], _k60, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r6[2], _k60, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r6[4], _k60, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r6[6], _k60, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[1], _k61, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r6[3], _k61, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r6[5], _k61, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r6[7], _k61, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[2], _k62, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r6[4], _k62, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r6[6], _k62, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r6[8], _k62, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[3], _k63, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r6[5], _k63, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r6[7], _k63, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r6[9], _k63, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[4], _k64, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r6[6], _k64, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r6[8], _k64, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r6[10], _k64, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[5], _k65, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r6[7], _k65, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r6[9], _k65, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r6[11], _k65, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[6], _k66, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, r6[8], _k66, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, r6[10], _k66, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, r6[12], _k66, vl); + + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0 + packn, _sum1, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl); outptr0 += packn * 4; @@ -853,135 +853,135 @@ static void conv7x7s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const } for (; j < outw; j++) { - vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl); + vfloat32m1_t _sum0 = __riscv_vle32_v_f32m1(outptr0, vl); - vfloat32m1_t _k00 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k01 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k02 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k03 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k04 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k05 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k06 = vle32_v_f32m1(kptr + packn * 6, vl); + vfloat32m1_t _k00 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k01 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k02 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k03 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k04 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); 
+ vfloat32m1_t _k05 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k06 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[3], _k03, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[4], _k04, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[5], _k05, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r0[6], _k06, vl); - - vfloat32m1_t _k10 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k11 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k12 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k13 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k14 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k15 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k16 = vle32_v_f32m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[3], _k03, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[4], _k04, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[5], _k05, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r0[6], _k06, vl); + + vfloat32m1_t _k10 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k11 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k12 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k13 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k14 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + vfloat32m1_t _k15 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k16 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[3], _k13, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[4], _k14, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[5], _k15, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r1[6], _k16, vl); - - vfloat32m1_t _k20 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k21 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k22 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k23 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k24 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k25 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k26 = vle32_v_f32m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[3], _k13, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[4], _k14, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[5], _k15, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r1[6], _k16, vl); + + vfloat32m1_t _k20 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k21 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k22 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k23 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k24 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + vfloat32m1_t _k25 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k26 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); - _sum0 = 
vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[3], _k23, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[4], _k24, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[5], _k25, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r2[6], _k26, vl); - - vfloat32m1_t _k30 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k31 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k32 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k33 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k34 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k35 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k36 = vle32_v_f32m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[3], _k23, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[4], _k24, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[5], _k25, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r2[6], _k26, vl); + + vfloat32m1_t _k30 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k31 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k32 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k33 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k34 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + vfloat32m1_t _k35 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k36 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f32m1(_sum0, r3[0], _k30, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r3[1], _k31, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r3[2], _k32, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r3[3], _k33, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r3[4], _k34, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r3[5], _k35, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r3[6], _k36, vl); - - vfloat32m1_t _k40 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k41 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k42 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k43 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k44 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k45 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k46 = vle32_v_f32m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[0], _k30, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[1], _k31, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[2], _k32, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[3], _k33, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[4], _k34, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[5], _k35, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r3[6], _k36, vl); + + vfloat32m1_t _k40 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k41 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k42 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k43 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k44 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + vfloat32m1_t _k45 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k46 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f32m1(_sum0, r4[0], _k40, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r4[1], _k41, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r4[2], _k42, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r4[3], _k43, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r4[4], _k44, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r4[5], _k45, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r4[6], _k46, vl); - - vfloat32m1_t _k50 = 
vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k51 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k52 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k53 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k54 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k55 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k56 = vle32_v_f32m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[0], _k40, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[1], _k41, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[2], _k42, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[3], _k43, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[4], _k44, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[5], _k45, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r4[6], _k46, vl); + + vfloat32m1_t _k50 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k51 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k52 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k53 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k54 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + vfloat32m1_t _k55 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k56 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f32m1(_sum0, r5[0], _k50, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r5[1], _k51, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r5[2], _k52, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r5[3], _k53, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r5[4], _k54, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r5[5], _k55, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r5[6], _k56, vl); - - vfloat32m1_t _k60 = vle32_v_f32m1(kptr, vl); - vfloat32m1_t _k61 = vle32_v_f32m1(kptr + packn, vl); - vfloat32m1_t _k62 = vle32_v_f32m1(kptr + packn * 2, vl); - vfloat32m1_t _k63 = vle32_v_f32m1(kptr + packn * 3, vl); - vfloat32m1_t _k64 = vle32_v_f32m1(kptr + packn * 4, vl); - vfloat32m1_t _k65 = vle32_v_f32m1(kptr + packn * 5, vl); - vfloat32m1_t _k66 = vle32_v_f32m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[0], _k50, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[1], _k51, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[2], _k52, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[3], _k53, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[4], _k54, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[5], _k55, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r5[6], _k56, vl); + + vfloat32m1_t _k60 = __riscv_vle32_v_f32m1(kptr, vl); + vfloat32m1_t _k61 = __riscv_vle32_v_f32m1(kptr + packn, vl); + vfloat32m1_t _k62 = __riscv_vle32_v_f32m1(kptr + packn * 2, vl); + vfloat32m1_t _k63 = __riscv_vle32_v_f32m1(kptr + packn * 3, vl); + vfloat32m1_t _k64 = __riscv_vle32_v_f32m1(kptr + packn * 4, vl); + vfloat32m1_t _k65 = __riscv_vle32_v_f32m1(kptr + packn * 5, vl); + vfloat32m1_t _k66 = __riscv_vle32_v_f32m1(kptr + packn * 6, vl); kptr -= packn * 42; - _sum0 = vfmacc_vf_f32m1(_sum0, r6[0], _k60, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r6[1], _k61, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r6[2], _k62, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r6[3], _k63, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r6[4], _k64, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r6[5], _k65, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, r6[6], _k66, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[0], _k60, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[1], _k61, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[2], _k62, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[3], _k63, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[4], _k64, 
vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[5], _k65, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, r6[6], _k66, vl); - vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); outptr0 += packn; diff --git a/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h b/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h index 01804bf391d..bccd01b4246 100644 --- a/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void conv7x7s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; @@ -33,7 +33,7 @@ static void conv7x7s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, { Mat out0 = top_blob.channel(p); - vfloat16m1_t _bias0 = bias ? vle16_v_f16m1(bias + p * packn, vl) : vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _bias0 = bias ? __riscv_vle16_v_f16m1(bias + p * packn, vl) : __riscv_vfmv_v_f_f16m1(0.f, vl); out0.fill(_bias0); for (int q = 0; q < inch; q++) @@ -59,492 +59,492 @@ static void conv7x7s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, int j = 0; for (; j + 7 < outw; j += 8) { - vfloat16m1_t _sum0 = vle16_v_f16m1(outptr0, vl); - vfloat16m1_t _sum1 = vle16_v_f16m1(outptr0 + packn, vl); - vfloat16m1_t _sum2 = vle16_v_f16m1(outptr0 + packn * 2, vl); - vfloat16m1_t _sum3 = vle16_v_f16m1(outptr0 + packn * 3, vl); - vfloat16m1_t _sum4 = vle16_v_f16m1(outptr0 + packn * 4, vl); - vfloat16m1_t _sum5 = vle16_v_f16m1(outptr0 + packn * 5, vl); - vfloat16m1_t _sum6 = vle16_v_f16m1(outptr0 + packn * 6, vl); - vfloat16m1_t _sum7 = vle16_v_f16m1(outptr0 + packn * 7, vl); - - vfloat16m1_t _k00 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k01 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k02 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k03 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k04 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k05 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k06 = vle16_v_f16m1(kptr + packn * 6, vl); + vfloat16m1_t _sum0 = __riscv_vle16_v_f16m1(outptr0, vl); + vfloat16m1_t _sum1 = __riscv_vle16_v_f16m1(outptr0 + packn, vl); + vfloat16m1_t _sum2 = __riscv_vle16_v_f16m1(outptr0 + packn * 2, vl); + vfloat16m1_t _sum3 = __riscv_vle16_v_f16m1(outptr0 + packn * 3, vl); + vfloat16m1_t _sum4 = __riscv_vle16_v_f16m1(outptr0 + packn * 4, vl); + vfloat16m1_t _sum5 = __riscv_vle16_v_f16m1(outptr0 + packn * 5, vl); + vfloat16m1_t _sum6 = __riscv_vle16_v_f16m1(outptr0 + packn * 6, vl); + vfloat16m1_t _sum7 = __riscv_vle16_v_f16m1(outptr0 + packn * 7, vl); + + vfloat16m1_t _k00 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k01 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k02 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k03 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k04 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k05 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k06 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[2], _k00, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[4], _k00, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[6], _k00, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r0[8], _k00, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, 
r0[10], _k00, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r0[12], _k00, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r0[14], _k00, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[3], _k01, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[5], _k01, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[7], _k01, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r0[9], _k01, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r0[11], _k01, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r0[13], _k01, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r0[15], _k01, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[4], _k02, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[6], _k02, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[8], _k02, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r0[10], _k02, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r0[12], _k02, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r0[14], _k02, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r0[16], _k02, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[3], _k03, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[5], _k03, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[7], _k03, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[9], _k03, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r0[11], _k03, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r0[13], _k03, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r0[15], _k03, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r0[17], _k03, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[4], _k04, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[6], _k04, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[8], _k04, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[10], _k04, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r0[12], _k04, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r0[14], _k04, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r0[16], _k04, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r0[18], _k04, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[5], _k05, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[7], _k05, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[9], _k05, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[11], _k05, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r0[13], _k05, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r0[15], _k05, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r0[17], _k05, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r0[19], _k05, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[6], _k06, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[8], _k06, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[10], _k06, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[12], _k06, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r0[14], _k06, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r0[16], _k06, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r0[18], _k06, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r0[20], _k06, vl); - - vfloat16m1_t _k10 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k11 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k12 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k13 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k14 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k15 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k16 = vle16_v_f16m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[2], _k00, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[4], _k00, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[6], _k00, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r0[8], _k00, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r0[10], _k00, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r0[12], _k00, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r0[14], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); + _sum1 = 
__riscv_vfmacc_vf_f16m1(_sum1, r0[3], _k01, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[5], _k01, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[7], _k01, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r0[9], _k01, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r0[11], _k01, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r0[13], _k01, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r0[15], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[4], _k02, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[6], _k02, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[8], _k02, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r0[10], _k02, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r0[12], _k02, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r0[14], _k02, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r0[16], _k02, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[3], _k03, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[5], _k03, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[7], _k03, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[9], _k03, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r0[11], _k03, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r0[13], _k03, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r0[15], _k03, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r0[17], _k03, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[4], _k04, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[6], _k04, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[8], _k04, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[10], _k04, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r0[12], _k04, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r0[14], _k04, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r0[16], _k04, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r0[18], _k04, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[5], _k05, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[7], _k05, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[9], _k05, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[11], _k05, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r0[13], _k05, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r0[15], _k05, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r0[17], _k05, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r0[19], _k05, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[6], _k06, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[8], _k06, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[10], _k06, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[12], _k06, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r0[14], _k06, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r0[16], _k06, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r0[18], _k06, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r0[20], _k06, vl); + + vfloat16m1_t _k10 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k11 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k12 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k13 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k14 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k15 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k16 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[2], _k10, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[4], _k10, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[6], _k10, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r1[8], _k10, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, 
r1[10], _k10, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r1[12], _k10, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r1[14], _k10, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[3], _k11, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[5], _k11, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[7], _k11, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r1[9], _k11, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r1[11], _k11, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r1[13], _k11, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r1[15], _k11, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[4], _k12, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[6], _k12, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[8], _k12, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r1[10], _k12, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r1[12], _k12, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r1[14], _k12, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r1[16], _k12, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[3], _k13, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[5], _k13, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[7], _k13, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[9], _k13, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r1[11], _k13, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r1[13], _k13, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r1[15], _k13, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r1[17], _k13, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[4], _k14, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[6], _k14, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[8], _k14, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[10], _k14, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r1[12], _k14, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r1[14], _k14, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r1[16], _k14, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r1[18], _k14, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[5], _k15, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[7], _k15, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[9], _k15, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[11], _k15, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r1[13], _k15, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r1[15], _k15, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r1[17], _k15, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r1[19], _k15, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[6], _k16, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[8], _k16, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[10], _k16, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[12], _k16, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r1[14], _k16, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r1[16], _k16, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r1[18], _k16, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r1[20], _k16, vl); - - vfloat16m1_t _k20 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k21 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k22 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k23 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k24 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k25 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k26 = vle16_v_f16m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[2], _k10, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[4], _k10, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[6], _k10, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r1[8], _k10, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r1[10], _k10, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r1[12], _k10, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r1[14], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); + _sum1 = 
__riscv_vfmacc_vf_f16m1(_sum1, r1[3], _k11, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[5], _k11, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[7], _k11, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r1[9], _k11, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r1[11], _k11, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r1[13], _k11, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r1[15], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[4], _k12, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[6], _k12, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[8], _k12, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r1[10], _k12, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r1[12], _k12, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r1[14], _k12, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r1[16], _k12, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[3], _k13, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[5], _k13, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[7], _k13, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[9], _k13, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r1[11], _k13, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r1[13], _k13, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r1[15], _k13, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r1[17], _k13, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[4], _k14, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[6], _k14, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[8], _k14, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[10], _k14, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r1[12], _k14, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r1[14], _k14, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r1[16], _k14, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r1[18], _k14, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[5], _k15, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[7], _k15, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[9], _k15, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[11], _k15, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r1[13], _k15, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r1[15], _k15, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r1[17], _k15, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r1[19], _k15, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[6], _k16, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[8], _k16, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[10], _k16, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[12], _k16, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r1[14], _k16, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r1[16], _k16, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r1[18], _k16, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r1[20], _k16, vl); + + vfloat16m1_t _k20 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k21 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k22 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k23 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k24 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k25 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k26 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[2], _k20, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[4], _k20, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[6], _k20, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r2[8], _k20, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, 
r2[10], _k20, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r2[12], _k20, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r2[14], _k20, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[3], _k21, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[5], _k21, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[7], _k21, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r2[9], _k21, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r2[11], _k21, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r2[13], _k21, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r2[15], _k21, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[4], _k22, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[6], _k22, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[8], _k22, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r2[10], _k22, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r2[12], _k22, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r2[14], _k22, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r2[16], _k22, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[3], _k23, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[5], _k23, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[7], _k23, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[9], _k23, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r2[11], _k23, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r2[13], _k23, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r2[15], _k23, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r2[17], _k23, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[4], _k24, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[6], _k24, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[8], _k24, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[10], _k24, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r2[12], _k24, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r2[14], _k24, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r2[16], _k24, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r2[18], _k24, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[5], _k25, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[7], _k25, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[9], _k25, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[11], _k25, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r2[13], _k25, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r2[15], _k25, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r2[17], _k25, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r2[19], _k25, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[6], _k26, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[8], _k26, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[10], _k26, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[12], _k26, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r2[14], _k26, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r2[16], _k26, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r2[18], _k26, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r2[20], _k26, vl); - - vfloat16m1_t _k30 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k31 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k32 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k33 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k34 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k35 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k36 = vle16_v_f16m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[2], _k20, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[4], _k20, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[6], _k20, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r2[8], _k20, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r2[10], _k20, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r2[12], _k20, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r2[14], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); + _sum1 = 
__riscv_vfmacc_vf_f16m1(_sum1, r2[3], _k21, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[5], _k21, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[7], _k21, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r2[9], _k21, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r2[11], _k21, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r2[13], _k21, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r2[15], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[4], _k22, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[6], _k22, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[8], _k22, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r2[10], _k22, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r2[12], _k22, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r2[14], _k22, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r2[16], _k22, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[3], _k23, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[5], _k23, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[7], _k23, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[9], _k23, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r2[11], _k23, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r2[13], _k23, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r2[15], _k23, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r2[17], _k23, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[4], _k24, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[6], _k24, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[8], _k24, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[10], _k24, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r2[12], _k24, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r2[14], _k24, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r2[16], _k24, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r2[18], _k24, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[5], _k25, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[7], _k25, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[9], _k25, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[11], _k25, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r2[13], _k25, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r2[15], _k25, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r2[17], _k25, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r2[19], _k25, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[6], _k26, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[8], _k26, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[10], _k26, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[12], _k26, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r2[14], _k26, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r2[16], _k26, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r2[18], _k26, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r2[20], _k26, vl); + + vfloat16m1_t _k30 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k31 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k32 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k33 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k34 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k35 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k36 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f16m1(_sum0, r3[0], _k30, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r3[2], _k30, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r3[4], _k30, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r3[6], _k30, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r3[8], _k30, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, 
r3[10], _k30, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r3[12], _k30, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r3[14], _k30, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r3[1], _k31, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r3[3], _k31, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r3[5], _k31, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r3[7], _k31, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r3[9], _k31, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r3[11], _k31, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r3[13], _k31, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r3[15], _k31, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r3[2], _k32, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r3[4], _k32, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r3[6], _k32, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r3[8], _k32, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r3[10], _k32, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r3[12], _k32, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r3[14], _k32, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r3[16], _k32, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r3[3], _k33, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r3[5], _k33, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r3[7], _k33, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r3[9], _k33, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r3[11], _k33, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r3[13], _k33, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r3[15], _k33, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r3[17], _k33, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r3[4], _k34, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r3[6], _k34, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r3[8], _k34, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r3[10], _k34, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r3[12], _k34, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r3[14], _k34, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r3[16], _k34, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r3[18], _k34, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r3[5], _k35, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r3[7], _k35, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r3[9], _k35, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r3[11], _k35, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r3[13], _k35, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r3[15], _k35, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r3[17], _k35, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r3[19], _k35, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r3[6], _k36, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r3[8], _k36, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r3[10], _k36, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r3[12], _k36, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r3[14], _k36, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r3[16], _k36, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r3[18], _k36, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r3[20], _k36, vl); - - vfloat16m1_t _k40 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k41 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k42 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k43 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k44 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k45 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k46 = vle16_v_f16m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[0], _k30, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r3[2], _k30, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r3[4], _k30, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r3[6], _k30, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r3[8], _k30, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r3[10], _k30, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r3[12], _k30, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r3[14], _k30, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[1], _k31, vl); + _sum1 = 
__riscv_vfmacc_vf_f16m1(_sum1, r3[3], _k31, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r3[5], _k31, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r3[7], _k31, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r3[9], _k31, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r3[11], _k31, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r3[13], _k31, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r3[15], _k31, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[2], _k32, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r3[4], _k32, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r3[6], _k32, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r3[8], _k32, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r3[10], _k32, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r3[12], _k32, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r3[14], _k32, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r3[16], _k32, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[3], _k33, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r3[5], _k33, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r3[7], _k33, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r3[9], _k33, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r3[11], _k33, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r3[13], _k33, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r3[15], _k33, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r3[17], _k33, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[4], _k34, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r3[6], _k34, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r3[8], _k34, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r3[10], _k34, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r3[12], _k34, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r3[14], _k34, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r3[16], _k34, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r3[18], _k34, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[5], _k35, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r3[7], _k35, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r3[9], _k35, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r3[11], _k35, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r3[13], _k35, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r3[15], _k35, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r3[17], _k35, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r3[19], _k35, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[6], _k36, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r3[8], _k36, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r3[10], _k36, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r3[12], _k36, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r3[14], _k36, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r3[16], _k36, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r3[18], _k36, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r3[20], _k36, vl); + + vfloat16m1_t _k40 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k41 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k42 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k43 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k44 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k45 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k46 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f16m1(_sum0, r4[0], _k40, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r4[2], _k40, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r4[4], _k40, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r4[6], _k40, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r4[8], _k40, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, 
r4[10], _k40, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r4[12], _k40, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r4[14], _k40, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r4[1], _k41, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r4[3], _k41, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r4[5], _k41, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r4[7], _k41, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r4[9], _k41, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r4[11], _k41, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r4[13], _k41, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r4[15], _k41, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r4[2], _k42, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r4[4], _k42, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r4[6], _k42, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r4[8], _k42, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r4[10], _k42, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r4[12], _k42, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r4[14], _k42, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r4[16], _k42, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r4[3], _k43, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r4[5], _k43, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r4[7], _k43, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r4[9], _k43, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r4[11], _k43, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r4[13], _k43, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r4[15], _k43, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r4[17], _k43, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r4[4], _k44, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r4[6], _k44, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r4[8], _k44, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r4[10], _k44, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r4[12], _k44, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r4[14], _k44, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r4[16], _k44, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r4[18], _k44, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r4[5], _k45, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r4[7], _k45, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r4[9], _k45, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r4[11], _k45, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r4[13], _k45, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r4[15], _k45, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r4[17], _k45, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r4[19], _k45, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r4[6], _k46, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r4[8], _k46, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r4[10], _k46, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r4[12], _k46, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r4[14], _k46, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r4[16], _k46, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r4[18], _k46, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r4[20], _k46, vl); - - vfloat16m1_t _k50 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k51 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k52 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k53 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k54 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k55 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k56 = vle16_v_f16m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[0], _k40, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r4[2], _k40, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r4[4], _k40, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r4[6], _k40, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r4[8], _k40, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r4[10], _k40, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r4[12], _k40, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r4[14], _k40, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[1], _k41, vl); + _sum1 = 
__riscv_vfmacc_vf_f16m1(_sum1, r4[3], _k41, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r4[5], _k41, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r4[7], _k41, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r4[9], _k41, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r4[11], _k41, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r4[13], _k41, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r4[15], _k41, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[2], _k42, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r4[4], _k42, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r4[6], _k42, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r4[8], _k42, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r4[10], _k42, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r4[12], _k42, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r4[14], _k42, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r4[16], _k42, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[3], _k43, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r4[5], _k43, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r4[7], _k43, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r4[9], _k43, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r4[11], _k43, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r4[13], _k43, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r4[15], _k43, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r4[17], _k43, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[4], _k44, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r4[6], _k44, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r4[8], _k44, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r4[10], _k44, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r4[12], _k44, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r4[14], _k44, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r4[16], _k44, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r4[18], _k44, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[5], _k45, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r4[7], _k45, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r4[9], _k45, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r4[11], _k45, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r4[13], _k45, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r4[15], _k45, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r4[17], _k45, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r4[19], _k45, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[6], _k46, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r4[8], _k46, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r4[10], _k46, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r4[12], _k46, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r4[14], _k46, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r4[16], _k46, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r4[18], _k46, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r4[20], _k46, vl); + + vfloat16m1_t _k50 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k51 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k52 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k53 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k54 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k55 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k56 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f16m1(_sum0, r5[0], _k50, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r5[2], _k50, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r5[4], _k50, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r5[6], _k50, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r5[8], _k50, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, 
r5[10], _k50, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r5[12], _k50, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r5[14], _k50, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r5[1], _k51, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r5[3], _k51, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r5[5], _k51, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r5[7], _k51, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r5[9], _k51, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r5[11], _k51, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r5[13], _k51, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r5[15], _k51, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r5[2], _k52, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r5[4], _k52, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r5[6], _k52, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r5[8], _k52, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r5[10], _k52, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r5[12], _k52, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r5[14], _k52, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r5[16], _k52, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r5[3], _k53, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r5[5], _k53, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r5[7], _k53, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r5[9], _k53, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r5[11], _k53, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r5[13], _k53, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r5[15], _k53, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r5[17], _k53, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r5[4], _k54, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r5[6], _k54, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r5[8], _k54, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r5[10], _k54, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r5[12], _k54, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r5[14], _k54, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r5[16], _k54, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r5[18], _k54, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r5[5], _k55, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r5[7], _k55, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r5[9], _k55, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r5[11], _k55, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r5[13], _k55, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r5[15], _k55, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r5[17], _k55, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r5[19], _k55, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r5[6], _k56, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r5[8], _k56, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r5[10], _k56, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r5[12], _k56, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r5[14], _k56, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r5[16], _k56, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r5[18], _k56, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r5[20], _k56, vl); - - vfloat16m1_t _k60 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k61 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k62 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k63 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k64 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k65 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k66 = vle16_v_f16m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[0], _k50, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r5[2], _k50, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r5[4], _k50, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r5[6], _k50, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r5[8], _k50, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r5[10], _k50, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r5[12], _k50, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r5[14], _k50, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[1], _k51, vl); + _sum1 = 
__riscv_vfmacc_vf_f16m1(_sum1, r5[3], _k51, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r5[5], _k51, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r5[7], _k51, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r5[9], _k51, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r5[11], _k51, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r5[13], _k51, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r5[15], _k51, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[2], _k52, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r5[4], _k52, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r5[6], _k52, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r5[8], _k52, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r5[10], _k52, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r5[12], _k52, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r5[14], _k52, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r5[16], _k52, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[3], _k53, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r5[5], _k53, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r5[7], _k53, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r5[9], _k53, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r5[11], _k53, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r5[13], _k53, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r5[15], _k53, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r5[17], _k53, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[4], _k54, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r5[6], _k54, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r5[8], _k54, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r5[10], _k54, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r5[12], _k54, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r5[14], _k54, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r5[16], _k54, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r5[18], _k54, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[5], _k55, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r5[7], _k55, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r5[9], _k55, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r5[11], _k55, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r5[13], _k55, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r5[15], _k55, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r5[17], _k55, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r5[19], _k55, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[6], _k56, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r5[8], _k56, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r5[10], _k56, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r5[12], _k56, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r5[14], _k56, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r5[16], _k56, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r5[18], _k56, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r5[20], _k56, vl); + + vfloat16m1_t _k60 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k61 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k62 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k63 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k64 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k65 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k66 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr -= packn * 42; - _sum0 = vfmacc_vf_f16m1(_sum0, r6[0], _k60, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r6[2], _k60, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r6[4], _k60, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r6[6], _k60, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r6[8], _k60, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, 
r6[10], _k60, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r6[12], _k60, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r6[14], _k60, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r6[1], _k61, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r6[3], _k61, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r6[5], _k61, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r6[7], _k61, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r6[9], _k61, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r6[11], _k61, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r6[13], _k61, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r6[15], _k61, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r6[2], _k62, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r6[4], _k62, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r6[6], _k62, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r6[8], _k62, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r6[10], _k62, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r6[12], _k62, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r6[14], _k62, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r6[16], _k62, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r6[3], _k63, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r6[5], _k63, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r6[7], _k63, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r6[9], _k63, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r6[11], _k63, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r6[13], _k63, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r6[15], _k63, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r6[17], _k63, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r6[4], _k64, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r6[6], _k64, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r6[8], _k64, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r6[10], _k64, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r6[12], _k64, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r6[14], _k64, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r6[16], _k64, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r6[18], _k64, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r6[5], _k65, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r6[7], _k65, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r6[9], _k65, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r6[11], _k65, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r6[13], _k65, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r6[15], _k65, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r6[17], _k65, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r6[19], _k65, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r6[6], _k66, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r6[8], _k66, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r6[10], _k66, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r6[12], _k66, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, r6[14], _k66, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, r6[16], _k66, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, r6[18], _k66, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, r6[20], _k66, vl); - - vse16_v_f16m1(outptr0, _sum0, vl); - vse16_v_f16m1(outptr0 + packn, _sum1, vl); - vse16_v_f16m1(outptr0 + packn * 2, _sum2, vl); - vse16_v_f16m1(outptr0 + packn * 3, _sum3, vl); - vse16_v_f16m1(outptr0 + packn * 4, _sum4, vl); - vse16_v_f16m1(outptr0 + packn * 5, _sum5, vl); - vse16_v_f16m1(outptr0 + packn * 6, _sum6, vl); - vse16_v_f16m1(outptr0 + packn * 7, _sum7, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[0], _k60, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r6[2], _k60, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r6[4], _k60, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r6[6], _k60, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r6[8], _k60, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r6[10], _k60, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r6[12], _k60, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r6[14], _k60, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[1], _k61, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, 
r6[3], _k61, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r6[5], _k61, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r6[7], _k61, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r6[9], _k61, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r6[11], _k61, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r6[13], _k61, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r6[15], _k61, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[2], _k62, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r6[4], _k62, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r6[6], _k62, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r6[8], _k62, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r6[10], _k62, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r6[12], _k62, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r6[14], _k62, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r6[16], _k62, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[3], _k63, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r6[5], _k63, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r6[7], _k63, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r6[9], _k63, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r6[11], _k63, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r6[13], _k63, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r6[15], _k63, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r6[17], _k63, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[4], _k64, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r6[6], _k64, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r6[8], _k64, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r6[10], _k64, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r6[12], _k64, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r6[14], _k64, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r6[16], _k64, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r6[18], _k64, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[5], _k65, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r6[7], _k65, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r6[9], _k65, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r6[11], _k65, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r6[13], _k65, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r6[15], _k65, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r6[17], _k65, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r6[19], _k65, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[6], _k66, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r6[8], _k66, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r6[10], _k66, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r6[12], _k66, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, r6[14], _k66, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, r6[16], _k66, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, r6[18], _k66, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, r6[20], _k66, vl); + + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr0 + packn, _sum1, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 2, _sum2, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 3, _sum3, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 4, _sum4, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 5, _sum5, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 6, _sum6, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 7, _sum7, vl); outptr0 += packn * 8; @@ -558,288 +558,288 @@ static void conv7x7s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, } for (; j + 3 < outw; j += 4) { - vfloat16m1_t _sum0 = vle16_v_f16m1(outptr0, vl); - vfloat16m1_t _sum1 = vle16_v_f16m1(outptr0 + packn, vl); - vfloat16m1_t _sum2 = vle16_v_f16m1(outptr0 + packn * 2, vl); - vfloat16m1_t _sum3 
= vle16_v_f16m1(outptr0 + packn * 3, vl); - - vfloat16m1_t _k00 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k01 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k02 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k03 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k04 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k05 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k06 = vle16_v_f16m1(kptr + packn * 6, vl); + vfloat16m1_t _sum0 = __riscv_vle16_v_f16m1(outptr0, vl); + vfloat16m1_t _sum1 = __riscv_vle16_v_f16m1(outptr0 + packn, vl); + vfloat16m1_t _sum2 = __riscv_vle16_v_f16m1(outptr0 + packn * 2, vl); + vfloat16m1_t _sum3 = __riscv_vle16_v_f16m1(outptr0 + packn * 3, vl); + + vfloat16m1_t _k00 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k01 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k02 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k03 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k04 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k05 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k06 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[2], _k00, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[4], _k00, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[6], _k00, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[3], _k01, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[5], _k01, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[7], _k01, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[4], _k02, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[6], _k02, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[8], _k02, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[3], _k03, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[5], _k03, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[7], _k03, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[9], _k03, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[4], _k04, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[6], _k04, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[8], _k04, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[10], _k04, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[5], _k05, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[7], _k05, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[9], _k05, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[11], _k05, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[6], _k06, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r0[8], _k06, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r0[10], _k06, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r0[12], _k06, vl); - - vfloat16m1_t _k10 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k11 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k12 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k13 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k14 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k15 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k16 = vle16_v_f16m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[2], _k00, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[4], _k00, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[6], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[3], _k01, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[5], _k01, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[7], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); + _sum1 = 
__riscv_vfmacc_vf_f16m1(_sum1, r0[4], _k02, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[6], _k02, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[8], _k02, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[3], _k03, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[5], _k03, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[7], _k03, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[9], _k03, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[4], _k04, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[6], _k04, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[8], _k04, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[10], _k04, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[5], _k05, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[7], _k05, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[9], _k05, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[11], _k05, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[6], _k06, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r0[8], _k06, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r0[10], _k06, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r0[12], _k06, vl); + + vfloat16m1_t _k10 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k11 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k12 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k13 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k14 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k15 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k16 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[2], _k10, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[4], _k10, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[6], _k10, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[3], _k11, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[5], _k11, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[7], _k11, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[4], _k12, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[6], _k12, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[8], _k12, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[3], _k13, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[5], _k13, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[7], _k13, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[9], _k13, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[4], _k14, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[6], _k14, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[8], _k14, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[10], _k14, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[5], _k15, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[7], _k15, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[9], _k15, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[11], _k15, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[6], _k16, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r1[8], _k16, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r1[10], _k16, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r1[12], _k16, vl); - - vfloat16m1_t _k20 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k21 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k22 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k23 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k24 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k25 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k26 = vle16_v_f16m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[2], _k10, vl); + _sum2 = 
__riscv_vfmacc_vf_f16m1(_sum2, r1[4], _k10, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[6], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[3], _k11, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[5], _k11, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[7], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[4], _k12, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[6], _k12, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[8], _k12, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[3], _k13, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[5], _k13, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[7], _k13, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[9], _k13, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[4], _k14, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[6], _k14, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[8], _k14, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[10], _k14, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[5], _k15, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[7], _k15, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[9], _k15, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[11], _k15, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[6], _k16, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r1[8], _k16, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r1[10], _k16, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r1[12], _k16, vl); + + vfloat16m1_t _k20 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k21 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k22 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k23 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k24 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k25 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k26 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[2], _k20, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[4], _k20, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[6], _k20, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[3], _k21, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[5], _k21, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[7], _k21, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[4], _k22, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[6], _k22, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[8], _k22, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[3], _k23, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[5], _k23, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[7], _k23, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[9], _k23, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[4], _k24, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[6], _k24, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[8], _k24, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[10], _k24, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[5], _k25, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[7], _k25, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[9], _k25, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[11], _k25, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[6], _k26, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r2[8], _k26, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r2[10], _k26, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r2[12], _k26, vl); - - vfloat16m1_t _k30 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k31 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t 
_k32 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k33 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k34 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k35 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k36 = vle16_v_f16m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[2], _k20, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[4], _k20, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[6], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[3], _k21, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[5], _k21, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[7], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[4], _k22, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[6], _k22, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[8], _k22, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[3], _k23, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[5], _k23, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[7], _k23, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[9], _k23, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[4], _k24, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[6], _k24, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[8], _k24, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[10], _k24, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[5], _k25, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[7], _k25, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[9], _k25, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[11], _k25, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[6], _k26, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r2[8], _k26, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r2[10], _k26, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r2[12], _k26, vl); + + vfloat16m1_t _k30 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k31 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k32 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k33 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k34 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k35 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k36 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f16m1(_sum0, r3[0], _k30, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r3[2], _k30, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r3[4], _k30, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r3[6], _k30, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r3[1], _k31, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r3[3], _k31, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r3[5], _k31, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r3[7], _k31, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r3[2], _k32, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r3[4], _k32, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r3[6], _k32, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r3[8], _k32, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r3[3], _k33, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r3[5], _k33, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r3[7], _k33, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r3[9], _k33, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r3[4], _k34, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r3[6], _k34, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r3[8], _k34, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r3[10], _k34, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r3[5], _k35, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r3[7], _k35, vl); - _sum2 = 
vfmacc_vf_f16m1(_sum2, r3[9], _k35, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r3[11], _k35, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r3[6], _k36, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r3[8], _k36, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r3[10], _k36, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r3[12], _k36, vl); - - vfloat16m1_t _k40 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k41 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k42 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k43 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k44 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k45 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k46 = vle16_v_f16m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[0], _k30, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r3[2], _k30, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r3[4], _k30, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r3[6], _k30, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[1], _k31, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r3[3], _k31, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r3[5], _k31, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r3[7], _k31, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[2], _k32, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r3[4], _k32, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r3[6], _k32, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r3[8], _k32, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[3], _k33, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r3[5], _k33, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r3[7], _k33, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r3[9], _k33, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[4], _k34, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r3[6], _k34, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r3[8], _k34, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r3[10], _k34, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[5], _k35, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r3[7], _k35, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r3[9], _k35, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r3[11], _k35, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[6], _k36, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r3[8], _k36, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r3[10], _k36, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r3[12], _k36, vl); + + vfloat16m1_t _k40 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k41 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k42 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k43 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k44 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k45 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k46 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f16m1(_sum0, r4[0], _k40, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r4[2], _k40, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r4[4], _k40, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r4[6], _k40, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r4[1], _k41, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r4[3], _k41, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r4[5], _k41, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r4[7], _k41, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r4[2], _k42, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r4[4], _k42, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r4[6], _k42, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r4[8], _k42, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r4[3], _k43, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r4[5], _k43, vl); - _sum2 = 
vfmacc_vf_f16m1(_sum2, r4[7], _k43, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r4[9], _k43, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r4[4], _k44, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r4[6], _k44, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r4[8], _k44, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r4[10], _k44, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r4[5], _k45, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r4[7], _k45, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r4[9], _k45, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r4[11], _k45, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r4[6], _k46, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r4[8], _k46, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r4[10], _k46, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r4[12], _k46, vl); - - vfloat16m1_t _k50 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k51 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k52 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k53 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k54 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k55 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k56 = vle16_v_f16m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[0], _k40, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r4[2], _k40, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r4[4], _k40, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r4[6], _k40, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[1], _k41, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r4[3], _k41, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r4[5], _k41, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r4[7], _k41, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[2], _k42, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r4[4], _k42, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r4[6], _k42, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r4[8], _k42, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[3], _k43, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r4[5], _k43, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r4[7], _k43, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r4[9], _k43, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[4], _k44, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r4[6], _k44, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r4[8], _k44, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r4[10], _k44, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[5], _k45, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r4[7], _k45, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r4[9], _k45, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r4[11], _k45, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[6], _k46, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r4[8], _k46, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r4[10], _k46, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r4[12], _k46, vl); + + vfloat16m1_t _k50 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k51 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k52 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k53 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k54 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k55 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k56 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f16m1(_sum0, r5[0], _k50, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r5[2], _k50, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r5[4], _k50, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r5[6], _k50, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r5[1], _k51, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r5[3], _k51, vl); - _sum2 = 
vfmacc_vf_f16m1(_sum2, r5[5], _k51, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r5[7], _k51, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r5[2], _k52, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r5[4], _k52, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r5[6], _k52, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r5[8], _k52, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r5[3], _k53, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r5[5], _k53, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r5[7], _k53, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r5[9], _k53, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r5[4], _k54, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r5[6], _k54, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r5[8], _k54, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r5[10], _k54, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r5[5], _k55, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r5[7], _k55, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r5[9], _k55, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r5[11], _k55, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r5[6], _k56, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r5[8], _k56, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r5[10], _k56, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r5[12], _k56, vl); - - vfloat16m1_t _k60 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k61 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k62 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k63 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k64 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k65 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k66 = vle16_v_f16m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[0], _k50, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r5[2], _k50, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r5[4], _k50, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r5[6], _k50, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[1], _k51, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r5[3], _k51, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r5[5], _k51, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r5[7], _k51, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[2], _k52, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r5[4], _k52, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r5[6], _k52, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r5[8], _k52, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[3], _k53, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r5[5], _k53, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r5[7], _k53, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r5[9], _k53, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[4], _k54, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r5[6], _k54, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r5[8], _k54, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r5[10], _k54, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[5], _k55, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r5[7], _k55, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r5[9], _k55, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r5[11], _k55, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[6], _k56, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r5[8], _k56, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r5[10], _k56, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r5[12], _k56, vl); + + vfloat16m1_t _k60 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k61 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k62 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k63 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k64 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k65 = __riscv_vle16_v_f16m1(kptr + packn * 5, 
vl); + vfloat16m1_t _k66 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr -= packn * 42; - _sum0 = vfmacc_vf_f16m1(_sum0, r6[0], _k60, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r6[2], _k60, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r6[4], _k60, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r6[6], _k60, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r6[1], _k61, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r6[3], _k61, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r6[5], _k61, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r6[7], _k61, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r6[2], _k62, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r6[4], _k62, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r6[6], _k62, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r6[8], _k62, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r6[3], _k63, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r6[5], _k63, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r6[7], _k63, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r6[9], _k63, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r6[4], _k64, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r6[6], _k64, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r6[8], _k64, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r6[10], _k64, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r6[5], _k65, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r6[7], _k65, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r6[9], _k65, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r6[11], _k65, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r6[6], _k66, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, r6[8], _k66, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, r6[10], _k66, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, r6[12], _k66, vl); - - vse16_v_f16m1(outptr0, _sum0, vl); - vse16_v_f16m1(outptr0 + packn, _sum1, vl); - vse16_v_f16m1(outptr0 + packn * 2, _sum2, vl); - vse16_v_f16m1(outptr0 + packn * 3, _sum3, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[0], _k60, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r6[2], _k60, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r6[4], _k60, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r6[6], _k60, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[1], _k61, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r6[3], _k61, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r6[5], _k61, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r6[7], _k61, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[2], _k62, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r6[4], _k62, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r6[6], _k62, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r6[8], _k62, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[3], _k63, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r6[5], _k63, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r6[7], _k63, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r6[9], _k63, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[4], _k64, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r6[6], _k64, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r6[8], _k64, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r6[10], _k64, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[5], _k65, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r6[7], _k65, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r6[9], _k65, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r6[11], _k65, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[6], _k66, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, r6[8], _k66, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, r6[10], _k66, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, r6[12], _k66, vl); + + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr0 + packn, _sum1, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 2, _sum2, vl); + __riscv_vse16_v_f16m1(outptr0 + 
packn * 3, _sum3, vl); outptr0 += packn * 4; @@ -853,135 +853,135 @@ static void conv7x7s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, } for (; j < outw; j++) { - vfloat16m1_t _sum0 = vle16_v_f16m1(outptr0, vl); + vfloat16m1_t _sum0 = __riscv_vle16_v_f16m1(outptr0, vl); - vfloat16m1_t _k00 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k01 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k02 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k03 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k04 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k05 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k06 = vle16_v_f16m1(kptr + packn * 6, vl); + vfloat16m1_t _k00 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k01 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k02 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k03 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k04 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k05 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k06 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[3], _k03, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[4], _k04, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[5], _k05, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r0[6], _k06, vl); - - vfloat16m1_t _k10 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k11 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k12 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k13 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k14 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k15 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k16 = vle16_v_f16m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[0], _k00, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[1], _k01, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[2], _k02, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[3], _k03, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[4], _k04, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[5], _k05, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r0[6], _k06, vl); + + vfloat16m1_t _k10 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k11 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k12 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k13 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k14 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k15 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k16 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[3], _k13, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[4], _k14, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[5], _k15, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r1[6], _k16, vl); - - vfloat16m1_t _k20 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k21 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k22 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k23 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k24 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k25 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k26 = vle16_v_f16m1(kptr + packn * 6, vl); 
+ _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[0], _k10, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[1], _k11, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[2], _k12, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[3], _k13, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[4], _k14, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[5], _k15, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r1[6], _k16, vl); + + vfloat16m1_t _k20 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k21 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k22 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k23 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k24 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k25 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k26 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[3], _k23, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[4], _k24, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[5], _k25, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r2[6], _k26, vl); - - vfloat16m1_t _k30 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k31 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k32 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k33 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k34 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k35 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k36 = vle16_v_f16m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[0], _k20, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[1], _k21, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[2], _k22, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[3], _k23, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[4], _k24, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[5], _k25, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r2[6], _k26, vl); + + vfloat16m1_t _k30 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k31 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k32 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k33 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k34 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k35 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k36 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f16m1(_sum0, r3[0], _k30, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r3[1], _k31, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r3[2], _k32, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r3[3], _k33, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r3[4], _k34, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r3[5], _k35, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r3[6], _k36, vl); - - vfloat16m1_t _k40 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k41 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k42 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k43 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k44 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k45 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k46 = vle16_v_f16m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[0], _k30, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[1], _k31, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[2], _k32, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[3], _k33, vl); + _sum0 = 
__riscv_vfmacc_vf_f16m1(_sum0, r3[4], _k34, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[5], _k35, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r3[6], _k36, vl); + + vfloat16m1_t _k40 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k41 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k42 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k43 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k44 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k45 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k46 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f16m1(_sum0, r4[0], _k40, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r4[1], _k41, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r4[2], _k42, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r4[3], _k43, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r4[4], _k44, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r4[5], _k45, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r4[6], _k46, vl); - - vfloat16m1_t _k50 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k51 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k52 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k53 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k54 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k55 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k56 = vle16_v_f16m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[0], _k40, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[1], _k41, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[2], _k42, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[3], _k43, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[4], _k44, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[5], _k45, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r4[6], _k46, vl); + + vfloat16m1_t _k50 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k51 = __riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k52 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k53 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k54 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k55 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k56 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr += packn * 7; - _sum0 = vfmacc_vf_f16m1(_sum0, r5[0], _k50, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r5[1], _k51, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r5[2], _k52, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r5[3], _k53, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r5[4], _k54, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r5[5], _k55, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r5[6], _k56, vl); - - vfloat16m1_t _k60 = vle16_v_f16m1(kptr, vl); - vfloat16m1_t _k61 = vle16_v_f16m1(kptr + packn, vl); - vfloat16m1_t _k62 = vle16_v_f16m1(kptr + packn * 2, vl); - vfloat16m1_t _k63 = vle16_v_f16m1(kptr + packn * 3, vl); - vfloat16m1_t _k64 = vle16_v_f16m1(kptr + packn * 4, vl); - vfloat16m1_t _k65 = vle16_v_f16m1(kptr + packn * 5, vl); - vfloat16m1_t _k66 = vle16_v_f16m1(kptr + packn * 6, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[0], _k50, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[1], _k51, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[2], _k52, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[3], _k53, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[4], _k54, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[5], _k55, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r5[6], _k56, vl); + + vfloat16m1_t _k60 = __riscv_vle16_v_f16m1(kptr, vl); + vfloat16m1_t _k61 = 
__riscv_vle16_v_f16m1(kptr + packn, vl); + vfloat16m1_t _k62 = __riscv_vle16_v_f16m1(kptr + packn * 2, vl); + vfloat16m1_t _k63 = __riscv_vle16_v_f16m1(kptr + packn * 3, vl); + vfloat16m1_t _k64 = __riscv_vle16_v_f16m1(kptr + packn * 4, vl); + vfloat16m1_t _k65 = __riscv_vle16_v_f16m1(kptr + packn * 5, vl); + vfloat16m1_t _k66 = __riscv_vle16_v_f16m1(kptr + packn * 6, vl); kptr -= packn * 42; - _sum0 = vfmacc_vf_f16m1(_sum0, r6[0], _k60, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r6[1], _k61, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r6[2], _k62, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r6[3], _k63, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r6[4], _k64, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r6[5], _k65, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, r6[6], _k66, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[0], _k60, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[1], _k61, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[2], _k62, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[3], _k63, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[4], _k64, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[5], _k65, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, r6[6], _k66, vl); - vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); outptr0 += packn; diff --git a/src/layer/riscv/convolution_pack1ton.h b/src/layer/riscv/convolution_pack1ton.h index 15eec7badd9..fd223b71327 100644 --- a/src/layer/riscv/convolution_pack1ton.h +++ b/src/layer/riscv/convolution_pack1ton.h @@ -15,7 +15,7 @@ static void convolution_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -57,11 +57,11 @@ static void convolution_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, cons { for (int j = 0; j < outw; j++) { - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias_data_ptr) { - _sum = vle32_v_f32m1(bias_data_ptr + p * packn, vl); + _sum = __riscv_vle32_v_f32m1(bias_data_ptr + p * packn, vl); } const float* kptr = (const float*)weight_data_pack1ton + maxk * channels * p * packn; @@ -75,8 +75,8 @@ static void convolution_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, cons for (int k = 0; k < maxk; k++) // 29.23 { float val = sptr[space_ofs[k]]; - vfloat32m1_t _w = vle32_v_f32m1(kptr, vl); - _sum = vfmacc_vf_f32m1(_sum, val, _w, vl); + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl); + _sum = __riscv_vfmacc_vf_f32m1(_sum, val, _w, vl); kptr += packn; } @@ -84,7 +84,7 @@ static void convolution_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, cons _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse32_v_f32m1(outptr + j * packn, _sum, vl); + __riscv_vse32_v_f32m1(outptr + j * packn, _sum, vl); } outptr += outw * packn; diff --git a/src/layer/riscv/convolution_pack1ton_fp16s.h b/src/layer/riscv/convolution_pack1ton_fp16s.h index 6f8c649e632..307e00ad87c 100644 --- a/src/layer/riscv/convolution_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void convolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int 
dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -57,11 +57,11 @@ static void convolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob { for (int j = 0; j < outw; j++) { - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); if (bias_data_ptr) { - _sum = vle32_v_f32m2(bias_data_ptr + p * packn, vl); + _sum = __riscv_vle32_v_f32m2(bias_data_ptr + p * packn, vl); } const __fp16* kptr = weight_data_fp16.channel(p); @@ -75,8 +75,8 @@ static void convolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob for (int k = 0; k < maxk; k++) { float val = (float)sptr[space_ofs[k]]; - vfloat16m1_t _w = vle16_v_f16m1(kptr, vl); - _sum = vfwmacc_vf_f32m2(_sum, val, _w, vl); + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); + _sum = __riscv_vfwmacc_vf_f32m2(_sum, val, _w, vl); kptr += packn; } @@ -84,7 +84,7 @@ static void convolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse16_v_f16m1(outptr + j * packn, vfncvt_f_f_w_f16m1(_sum, vl), vl); + __riscv_vse16_v_f16m1(outptr + j * packn, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); } outptr += outw * packn; @@ -95,7 +95,7 @@ static void convolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob static void convolution_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -137,11 +137,11 @@ static void convolution_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blo { for (int j = 0; j < outw; j++) { - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); if (bias_data_ptr) { - _sum = vle16_v_f16m1(bias_data_ptr + p * packn, vl); + _sum = __riscv_vle16_v_f16m1(bias_data_ptr + p * packn, vl); } const __fp16* kptr = weight_data_fp16.channel(p); @@ -155,8 +155,8 @@ static void convolution_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blo for (int k = 0; k < maxk; k++) { __fp16 val = sptr[space_ofs[k]]; - vfloat16m1_t _w = vle16_v_f16m1(kptr, vl); - _sum = vfmacc_vf_f16m1(_sum, val, _w, vl); + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); + _sum = __riscv_vfmacc_vf_f16m1(_sum, val, _w, vl); kptr += packn; } @@ -164,7 +164,7 @@ static void convolution_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blo _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse16_v_f16m1(outptr + j * packn, _sum, vl); + __riscv_vse16_v_f16m1(outptr + j * packn, _sum, vl); } outptr += outw * packn; diff --git a/src/layer/riscv/convolution_packn.h b/src/layer/riscv/convolution_packn.h index 9d18c1d858e..4f89c9dada6 100644 --- a/src/layer/riscv/convolution_packn.h +++ b/src/layer/riscv/convolution_packn.h @@ -15,7 +15,7 @@ static void convolution_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packn, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, 
int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -56,11 +56,11 @@ static void convolution_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M { for (int j = 0; j < outw; j++) { - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias_data_ptr) { - _sum = vle32_v_f32m1(bias_data_ptr + p * packn, vl); + _sum = __riscv_vle32_v_f32m1(bias_data_ptr + p * packn, vl); } const float* kptr = (const float*)weight_data_packn.channel(p); @@ -78,8 +78,8 @@ static void convolution_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M for (int l = 0; l < packn; l++) { float val = *slptr++; - vfloat32m1_t _w0 = vle32_v_f32m1(kptr, vl); - _sum = vfmacc_vf_f32m1(_sum, val, _w0, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(kptr, vl); + _sum = __riscv_vfmacc_vf_f32m1(_sum, val, _w0, vl); kptr += packn; } @@ -88,7 +88,7 @@ static void convolution_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse32_v_f32m1(outptr + j * packn, _sum, vl); + __riscv_vse32_v_f32m1(outptr + j * packn, _sum, vl); } outptr += outw * packn; diff --git a/src/layer/riscv/convolution_packn_fp16s.h b/src/layer/riscv/convolution_packn_fp16s.h index 0d36cf207cf..9f5e25c25d6 100644 --- a/src/layer/riscv/convolution_packn_fp16s.h +++ b/src/layer/riscv/convolution_packn_fp16s.h @@ -15,7 +15,7 @@ static void convolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -57,11 +57,11 @@ static void convolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, c { for (int j = 0; j < outw; j++) { - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); if (bias_data_ptr) { - _sum = vle32_v_f32m2(bias_data_ptr + p * packn, vl); + _sum = __riscv_vle32_v_f32m2(bias_data_ptr + p * packn, vl); } const __fp16* kptr = weight_data_fp16.channel(p); @@ -79,11 +79,11 @@ static void convolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, c for (int l = 0; l < packn; l++) { float val = (float)*slptr++; - vfloat16m1_t _w0 = vle16_v_f16m1(kptr, vl); - // _sum = vfwmacc_vf_f32m2(_sum, val, _w0, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr, vl); + // _sum = __riscv_vfwmacc_vf_f32m2(_sum, val, _w0, vl); - vfloat32m2_t _qwq = vfwmul_vf_f32m2(_w0, val, vl); - _sum = vfadd_vv_f32m2(_sum, _qwq, vl); + vfloat32m2_t _qwq = __riscv_vfwmul_vf_f32m2(_w0, val, vl); + _sum = __riscv_vfadd_vv_f32m2(_sum, _qwq, vl); kptr += packn; } @@ -92,7 +92,7 @@ static void convolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, c _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse16_v_f16m1(outptr + j * packn, vfncvt_f_f_w_f16m1(_sum, vl), vl); + __riscv_vse16_v_f16m1(outptr + j * packn, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); } outptr += outw * packn; @@ -103,7 +103,7 @@ static void 
convolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, c static void convolution_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -145,11 +145,11 @@ static void convolution_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, { for (int j = 0; j < outw; j++) { - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); if (bias_data_ptr) { - _sum = vle16_v_f16m1(bias_data_ptr + p * packn, vl); + _sum = __riscv_vle16_v_f16m1(bias_data_ptr + p * packn, vl); } const __fp16* kptr = weight_data_fp16.channel(p); @@ -167,8 +167,8 @@ static void convolution_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, for (int l = 0; l < packn; l++) { __fp16 val = *slptr++; - vfloat16m1_t _w0 = vle16_v_f16m1(kptr, vl); - _sum = vfmacc_vf_f16m1(_sum, val, _w0, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr, vl); + _sum = __riscv_vfmacc_vf_f16m1(_sum, val, _w0, vl); kptr += packn; } @@ -177,7 +177,7 @@ static void convolution_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse16_v_f16m1(outptr + j * packn, _sum, vl); + __riscv_vse16_v_f16m1(outptr + j * packn, _sum, vl); } outptr += outw * packn; diff --git a/src/layer/riscv/convolution_packnto1.h b/src/layer/riscv/convolution_packnto1.h index 4c66116d20e..79c46daaaa9 100644 --- a/src/layer/riscv/convolution_packnto1.h +++ b/src/layer/riscv/convolution_packnto1.h @@ -15,7 +15,7 @@ static void convolution_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packnto1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -64,7 +64,7 @@ static void convolution_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, cons sum = bias_data_ptr[p]; } - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); const float* kptr = (const float*)weight_data_packnto1.channel(p); @@ -76,15 +76,15 @@ static void convolution_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, cons for (int k = 0; k < maxk; k++) { - vfloat32m1_t _val = vle32_v_f32m1(sptr + space_ofs[k] * packn, vl); - vfloat32m1_t _w = vle32_v_f32m1(kptr, vl); - _sum = vfmacc_vv_f32m1(_sum, _val, _w, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(sptr + space_ofs[k] * packn, vl); + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl); + _sum = __riscv_vfmacc_vv_f32m1(_sum, _val, _w, vl); kptr += packn; } } - sum = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m1_f32m1(vfloat32m1_t(), _sum, vfmv_s_f_f32m1(vfloat32m1_t(), sum, vl), vl)); + sum = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m1_f32m1(_sum, __riscv_vfmv_s_f_f32m1(sum, vl), vl)); sum = activation_ss(sum, activation_type, activation_params); diff --git a/src/layer/riscv/convolution_packnto1_fp16s.h b/src/layer/riscv/convolution_packnto1_fp16s.h index 
83efd3081f8..87c1ad80709 100644 --- a/src/layer/riscv/convolution_packnto1_fp16s.h +++ b/src/layer/riscv/convolution_packnto1_fp16s.h @@ -15,7 +15,7 @@ static void convolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -64,7 +64,7 @@ static void convolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob sum = bias_data_ptr[p]; } - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); const __fp16* kptr = weight_data_fp16.channel(p); @@ -76,9 +76,9 @@ static void convolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob for (int k = 0; k < maxk; k++) { - vfloat16m1_t _val = vle16_v_f16m1(sptr + space_ofs[k] * packn, vl); - vfloat16m1_t _w = vle16_v_f16m1(kptr, vl); - _sum = vfwmacc_vv_f32m2(_sum, _val, _w, vl); + vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr + space_ofs[k] * packn, vl); + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); + _sum = __riscv_vfwmacc_vv_f32m2(_sum, _val, _w, vl); kptr += packn; } @@ -87,13 +87,13 @@ static void convolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob #if C906 // TODO std::vector<float> ss(packn); - vse32_v_f32m2((float*)ss.data(), _sum, vl); + __riscv_vse32_v_f32m2((float*)ss.data(), _sum, vl); for (int i = 0; i < packn; i++) { sum += ss[i]; } #else - sum = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m2_f32m1(vfloat32m1_t(), _sum, vfmv_s_f_f32m1(vfloat32m1_t(), sum, vl), vl)); + sum = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(_sum, __riscv_vfmv_s_f_f32m1(sum, vl), vl)); #endif sum = activation_ss(sum, activation_type, activation_params); @@ -109,7 +109,7 @@ static void convolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob static void convolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -158,7 +158,7 @@ static void convolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blo sum = bias_data_ptr[p]; } - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); const __fp16* kptr = weight_data_fp16.channel(p); @@ -170,15 +170,15 @@ static void convolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blo for (int k = 0; k < maxk; k++) { - vfloat16m1_t _val = vle16_v_f16m1(sptr + space_ofs[k] * packn, vl); - vfloat16m1_t _w = vle16_v_f16m1(kptr, vl); - _sum = vfmacc_vv_f16m1(_sum, _val, _w, vl); + vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr + space_ofs[k] * packn, vl); + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); + _sum = __riscv_vfmacc_vv_f16m1(_sum, _val, _w, vl); kptr += packn; } } - sum = vfmv_f_s_f16m1_f16(vfredusum_vs_f16m1_f16m1(vfloat16m1_t(), _sum, vfmv_s_f_f16m1(vfloat16m1_t(), sum, vl), vl)); + sum = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum, 
__riscv_vfmv_s_f_f16m1(sum, vl), vl)); sum = activation_ss(sum, activation_type, activation_params); diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp index fef27f21967..9e7f5afdab4 100644 --- a/src/layer/riscv/convolution_riscv.cpp +++ b/src/layer/riscv/convolution_riscv.cpp @@ -21,7 +21,6 @@ #if __riscv_vector #include #endif // __riscv_vector - #include "riscv_activation.h" #include "riscv_usability.h" @@ -51,38 +50,20 @@ namespace ncnn { #include "convolution_3x3_packn.h" #include "convolution_3x3_pack1ton.h" #include "convolution_7x7_pack1ton.h" - -#if __riscv_zfh -#include "convolution_fp16s.h" -#include "convolution_packn_fp16s.h" -#include "convolution_pack1ton_fp16s.h" -#include "convolution_packnto1_fp16s.h" - -#include "convolution_sgemm_fp16s.h" -#include "convolution_sgemm_packn_fp16s.h" -#include "convolution_sgemm_pack1ton_fp16s.h" -#include "convolution_sgemm_packnto1_fp16s.h" -#include "convolution_winograd_transform_packn_fp16s.h" -#include "convolution_winograd_dot_packn_fp16s.h" -#include "convolution_1x1_fp16s.h" -#include "convolution_1x1_packn_fp16s.h" -#include "convolution_1x1_pack1ton_fp16s.h" -#include "convolution_1x1_packnto1_fp16s.h" -#include "convolution_3x3_packn_fp16s.h" -#include "convolution_3x3_pack1ton_fp16s.h" -#include "convolution_7x7_pack1ton_fp16s.h" - -#endif #endif // __riscv_vector Convolution_riscv::Convolution_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif activation = 0; } @@ -138,8 +119,8 @@ int Convolution_riscv::create_pipeline(const Option& opt) } #endif -#if __riscv_vector && __riscv_zfh - if (opt.use_fp16_storage) +#if NCNN_ZFH + if (support_fp16_storage && opt.use_fp16_storage) { return create_pipeline_fp16s(opt); } @@ -324,9 +305,9 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti return 0; } +#if NCNN_ZFH int elembits = bottom_blob.elembits(); -#if __riscv_vector && __riscv_zfh if (opt.use_fp16_storage && elembits == 16) { if (opt.use_fp16_arithmetic) @@ -639,7 +620,7 @@ int Convolution_riscv::forward(const std::vector& bottom_blobs, std::vector return -100; #if NCNN_RVV - if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && weight_data_flattened.elembits() == 16) + if (opt.use_fp16_storage && cpu_support_riscv_zvfh() && weight_data_flattened.elembits() == 16) { Mat weight_data_flattened_fp32; cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt); @@ -661,7 +642,7 @@ int Convolution_riscv::forward(const std::vector& bottom_blobs, std::vector return -100; #if NCNN_RVV - if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && bias_data_flattened.elembits() == 16) + if (opt.use_fp16_storage && cpu_support_riscv_zvfh() && bias_data_flattened.elembits() == 16) { Mat bias_data_flattened_fp32; cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt); @@ -715,393 +696,4 @@ int Convolution_riscv::forward(const std::vector& bottom_blobs, std::vector return 0; } -#if __riscv_vector && __riscv_zfh -static void convolution_transform_kernel_packed_fp16s_rvv(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) -{ - const int maxk = kernel_w * kernel_h; - - 
// src = kw-kh-inch-outch - // dst = pb-pa-kw-kh-inch/pa-outch/pb - { - Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); - - weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack); - - for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) - { - __fp16* g00 = weight_data_tm.channel(q / out_elempack); - - for (int p = 0; p + (elempack - 1) < num_input; p += elempack) - { - for (int k = 0; k < maxk; k++) - { - for (int i = 0; i < elempack; i++) - { - for (int j = 0; j < out_elempack; j++) - { - const float* k00 = weight_data_r2.channel(q + j).row(p + i); - - g00[0] = (__fp16)k00[k]; - - g00++; - } - } - } - } - } - } -} - -int Convolution_riscv::create_pipeline_fp16s(const Option& opt) -{ - const int packn = csrr_vlenb() / 2; - - const int maxk = kernel_w * kernel_h; - const int num_input = weight_data_size / maxk / num_output; - - int elempack = 1; - int out_elempack = 1; - - if (opt.use_packing_layout) - { - elempack = num_input % packn == 0 ? packn : 1; - out_elempack = num_output % packn == 0 ? packn : 1; - } - - // packn - if (elempack == packn && out_elempack == packn) - { - if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && opt.use_fp16_arithmetic && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - { - if ((opt.use_winograd63_convolution && num_input >= packn * 2 && num_output >= packn * 2 && num_input <= packn * 16 && num_output <= packn * 16) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution)) - conv3x3s1_winograd63_transform_kernel_packn_fp16sa_rvv(weight_data, weight_winograd63_data, num_input, num_output, opt); - else if ((opt.use_winograd43_convolution && num_input >= packn * 2 && num_output >= packn * 2) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution)) - conv3x3s1_winograd43_transform_kernel_packn_fp16sa_rvv(weight_data, weight_winograd43_data, num_input, num_output, opt); - else // if (opt.use_winograd23_convolution) - conv3x3s1_winograd23_transform_kernel_packn_fp16sa_rvv(weight_data, weight_winograd23_data, num_input, num_output, opt); - } - else - { - convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); - } - } - - // pack1ton - if (elempack == 1 && out_elempack == packn) - { - convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); - } - - // packnto1 - if (elempack == packn && out_elempack == 1) - { - if (opt.use_fp16_arithmetic && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - { - convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); - } - else if (opt.use_fp16_arithmetic && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) - { - convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); - } - else if (opt.use_fp16_arithmetic && opt.use_sgemm_convolution) - { - convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); - } - else - { - 
convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); - } - } - - // pack1 - if (elempack == 1 && out_elempack == 1) - { - if (opt.use_fp16_arithmetic && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - { - convolution_im2col_sgemm_transform_kernel_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); - } - else if (opt.use_fp16_arithmetic && opt.use_sgemm_convolution) - { - convolution_im2col_sgemm_transform_kernel_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); - } - else - { - convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); - } - } - - if (opt.use_fp16_arithmetic) - { - ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); - } - - if (opt.lightmode) - weight_data.release(); - - return 0; -} - -int Convolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - - int w = bottom_blob.w; - int h = bottom_blob.h; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - // NCNN_LOGE("Convolution forward_fp16s input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_top, kernel_w, kernel_h, stride_w, stride_h); - - const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; - const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - - Mat bottom_blob_bordered; - make_padding(bottom_blob, bottom_blob_bordered, opt); - if (bottom_blob_bordered.empty()) - return -100; - - w = bottom_blob_bordered.w; - h = bottom_blob_bordered.h; - - int outw = (w - kernel_extent_w) / stride_w + 1; - int outh = (h - kernel_extent_h) / stride_h + 1; - int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? 
packn : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; - - top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (elempack == packn && out_elempack == packn) - { - { - convolution_packn_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); - } - } - - if (elempack == 1 && out_elempack == packn) - { - { - convolution_pack1ton_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); - } - } - - if (elempack == packn && out_elempack == 1) - { - { - convolution_packnto1_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); - } - } - - if (elempack == 1 && out_elempack == 1) - { - { - convolution_fp16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); - } - } - - return 0; -} - -int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - // NCNN_LOGE("Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h); - - const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; - const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - - Mat bottom_blob_bordered; - make_padding(bottom_blob, bottom_blob_bordered, opt); - if (bottom_blob_bordered.empty()) - return -100; - - w = bottom_blob_bordered.w; - h = bottom_blob_bordered.h; - - int outw = (w - kernel_extent_w) / stride_w + 1; - int outh = (h - kernel_extent_h) / stride_h + 1; - int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? 
packn : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; - - top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - const int num_input = channels * elempack; - - if (elempack == packn && out_elempack == packn) - { - if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - { - conv1x1s1_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) - { - conv1x1s2_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - { - if ((opt.use_winograd63_convolution && num_input >= packn * 2 && num_output >= packn * 2 && num_input <= packn * 16 && num_output <= packn * 16) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution)) - conv3x3s1_winograd63_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data_fp16, opt); - else if ((opt.use_winograd43_convolution && num_input >= packn * 2 && num_output >= packn * 2) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution)) - conv3x3s1_winograd43_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data_fp16, opt); - else // if (opt.use_winograd23_convolution) - conv3x3s1_winograd23_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data_fp16, opt); - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else if (opt.use_sgemm_convolution) - { - convolution_im2col_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else - { - convolution_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); - } - } - - if (elempack == 1 && out_elempack == packn) - { - if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - { - conv1x1s1_sgemm_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - { - conv3x3s1_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) - { - conv3x3s2_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else if (kernel_w == 7 && kernel_h == 7 
&& dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) - { - conv7x7s2_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else if (opt.use_sgemm_convolution) - { - convolution_im2col_sgemm_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else - { - convolution_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); - } - } - - if (elempack == packn && out_elempack == 1) - { - if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - { - conv1x1s1_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) - { - conv1x1s2_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else if (opt.use_sgemm_convolution) - { - convolution_im2col_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else - { - convolution_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); - } - } - - if (elempack == 1 && out_elempack == 1) - { - if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - { - conv1x1s1_sgemm_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else if (opt.use_sgemm_convolution) - { - convolution_im2col_sgemm_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else - { - convolution_fp16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); - } - } - - return 0; -} -#endif // __riscv_vector && __riscv_zfh - } // namespace ncnn diff --git a/src/layer/riscv/convolution_riscv.h b/src/layer/riscv/convolution_riscv.h index a4e008c9dd1..4add7108643 100644 --- a/src/layer/riscv/convolution_riscv.h +++ b/src/layer/riscv/convolution_riscv.h @@ -32,7 +32,7 @@ class Convolution_riscv : public Convolution virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int create_pipeline_fp16s(const Option& opt); int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; diff --git 
a/src/layer/riscv/convolution_riscv_zfh.cpp b/src/layer/riscv/convolution_riscv_zfh.cpp new file mode 100644 index 00000000000..861510f1ee4 --- /dev/null +++ b/src/layer/riscv/convolution_riscv_zfh.cpp @@ -0,0 +1,463 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector +#include "riscv_activation.h" +#include "riscv_usability.h" + +namespace ncnn { + +#if NCNN_ZFH +#include "convolution_fp16s.h" +#include "convolution_sgemm_fp16s.h" +#include "convolution_1x1_fp16s.h" +#if __riscv_zvfh +#include "convolution_packn_fp16s.h" +#include "convolution_pack1ton_fp16s.h" +#include "convolution_packnto1_fp16s.h" + +#include "convolution_sgemm_packn_fp16s.h" +#include "convolution_sgemm_pack1ton_fp16s.h" +#include "convolution_sgemm_packnto1_fp16s.h" +#include "convolution_winograd_transform_packn_fp16s.h" +#include "convolution_winograd_dot_packn_fp16s.h" +#include "convolution_1x1_packn_fp16s.h" +#include "convolution_1x1_pack1ton_fp16s.h" +#include "convolution_1x1_packnto1_fp16s.h" +#include "convolution_3x3_packn_fp16s.h" +#include "convolution_3x3_pack1ton_fp16s.h" +#include "convolution_7x7_pack1ton_fp16s.h" +#endif +#endif // NCNN_ZFH + +#if NCNN_ZFH +static void convolution_transform_kernel_packed_fp16s_rvv(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) +{ + const int maxk = kernel_w * kernel_h; + + // src = kw-kh-inch-outch + // dst = pb-pa-kw-kh-inch/pa-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + __fp16* g00 = weight_data_tm.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < elempack; i++) + { + for (int j = 0; j < out_elempack; j++) + { + const float* k00 = weight_data_r2.channel(q + j).row(p + i); + + g00[0] = (__fp16)k00[k]; + + g00++; + } + } + } + } + } + } +} + +int Convolution_riscv::create_pipeline_fp16s(const Option& opt) +{ +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh + + const int maxk = kernel_w * kernel_h; + const int num_input = weight_data_size / maxk / num_output; + + int elempack = 1; + int out_elempack = 1; + +#if __riscv_zvfh + if (opt.use_packing_layout) + { + elempack = num_input % packn == 0 ? packn : 1; + out_elempack = num_output % packn == 0 ? 
packn : 1; + } +#endif // __riscv_zvfh + +#if __riscv_zvfh + // packn + if (elempack == packn && out_elempack == packn) + { + if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && opt.use_fp16_arithmetic && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if ((opt.use_winograd63_convolution && num_input >= packn * 2 && num_output >= packn * 2 && num_input <= packn * 16 && num_output <= packn * 16) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd63_transform_kernel_packn_fp16sa_rvv(weight_data, weight_winograd63_data, num_input, num_output, opt); + else if ((opt.use_winograd43_convolution && num_input >= packn * 2 && num_output >= packn * 2) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd43_transform_kernel_packn_fp16sa_rvv(weight_data, weight_winograd43_data, num_input, num_output, opt); + else // if (opt.use_winograd23_convolution) + conv3x3s1_winograd23_transform_kernel_packn_fp16sa_rvv(weight_data, weight_winograd23_data, num_input, num_output, opt); + } + else + { + convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack1ton + if (elempack == 1 && out_elempack == packn) + { + convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + + // packnto1 + if (elempack == packn && out_elempack == 1) + { + if (opt.use_fp16_arithmetic && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_fp16_arithmetic && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_fp16_arithmetic && opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } +#endif // __riscv_zvfh + + // pack1 + if (elempack == 1 && out_elempack == 1) + { + if (opt.use_fp16_arithmetic && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_fp16_arithmetic && opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_fp16sa_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_fp16s_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (opt.use_fp16_arithmetic) + { + ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); + } + + if (opt.lightmode) + weight_data.release(); + + return 0; +} + +int Convolution_riscv::forward_fp16s(const Mat& bottom_blob, 
Mat& top_blob, const Option& opt) const +{ +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh + + int w = bottom_blob.w; + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + // NCNN_LOGE("Convolution forward_fp16s input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_top, kernel_w, kernel_h, stride_w, stride_h); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __riscv_zvfh + if (elempack == packn && out_elempack == packn) + { + { + convolution_packn_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == packn) + { + { + convolution_pack1ton_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == packn && out_elempack == 1) + { + { + convolution_packnto1_fp16s_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } +#endif // __riscv_zvfh + + if (elempack == 1 && out_elempack == 1) + { + { + convolution_fp16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + return 0; +} + +int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + // NCNN_LOGE("Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? 
packn : 1; + } +#endif // __riscv_zvfh + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int num_input = channels * elempack; + +#if __riscv_zvfh + if (elempack == packn && out_elempack == packn) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if ((opt.use_winograd63_convolution && num_input >= packn * 2 && num_output >= packn * 2 && num_input <= packn * 16 && num_output <= packn * 16) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd63_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data_fp16, opt); + else if ((opt.use_winograd43_convolution && num_input >= packn * 2 && num_output >= packn * 2) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd43_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data_fp16, opt); + else // if (opt.use_winograd23_convolution) + conv3x3s1_winograd23_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data_fp16, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + convolution_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == packn) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv3x3s2_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + 
} + else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv7x7s2_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + convolution_pack1ton_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == packn && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + convolution_packnto1_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } +#endif // __riscv_zvfh + + if (elempack == 1 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + convolution_fp16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/convolution_sgemm.h b/src/layer/riscv/convolution_sgemm.h index 801b7cc456f..f9034fadfd9 100644 --- a/src/layer/riscv/convolution_sgemm.h +++ b/src/layer/riscv/convolution_sgemm.h @@ -16,7 +16,7 @@ static void im2col_sgemm_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& { #if __riscv_vector const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); #endif // Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); @@ -58,7 +58,7 @@ static void im2col_sgemm_rvv(const Mat& bottom_im2col, Mat& top_blob, const 
Mat& for (int k = 0; k < maxk; k++) { - vse32_v_f32m1(tmpptr, vle32_v_f32m1(img0, vl), vl); + __riscv_vse32_v_f32m1(tmpptr, __riscv_vle32_v_f32m1(img0, vl), vl); img0 += size; tmpptr += packn; } @@ -144,38 +144,38 @@ static void im2col_sgemm_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& int nn = inch * maxk; // inch always > 0 - vfloat32m1_t _sum0 = vfmv_v_f_f32m1(biasptr[0], vl); - vfloat32m1_t _sum1 = vfmv_v_f_f32m1(biasptr[1], vl); - vfloat32m1_t _sum2 = vfmv_v_f_f32m1(biasptr[2], vl); - vfloat32m1_t _sum3 = vfmv_v_f_f32m1(biasptr[3], vl); - vfloat32m1_t _sum4 = vfmv_v_f_f32m1(biasptr[4], vl); - vfloat32m1_t _sum5 = vfmv_v_f_f32m1(biasptr[5], vl); - vfloat32m1_t _sum6 = vfmv_v_f_f32m1(biasptr[6], vl); - vfloat32m1_t _sum7 = vfmv_v_f_f32m1(biasptr[7], vl); + vfloat32m1_t _sum0 = __riscv_vfmv_v_f_f32m1(biasptr[0], vl); + vfloat32m1_t _sum1 = __riscv_vfmv_v_f_f32m1(biasptr[1], vl); + vfloat32m1_t _sum2 = __riscv_vfmv_v_f_f32m1(biasptr[2], vl); + vfloat32m1_t _sum3 = __riscv_vfmv_v_f_f32m1(biasptr[3], vl); + vfloat32m1_t _sum4 = __riscv_vfmv_v_f_f32m1(biasptr[4], vl); + vfloat32m1_t _sum5 = __riscv_vfmv_v_f_f32m1(biasptr[5], vl); + vfloat32m1_t _sum6 = __riscv_vfmv_v_f_f32m1(biasptr[6], vl); + vfloat32m1_t _sum7 = __riscv_vfmv_v_f_f32m1(biasptr[7], vl); for (int q = 0; q < nn; q++) { - vfloat32m1_t _val = vle32_v_f32m1(tmpptr, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, kptr[0], _val, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, kptr[1], _val, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, kptr[2], _val, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, kptr[3], _val, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, kptr[4], _val, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, kptr[5], _val, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, kptr[6], _val, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, kptr[7], _val, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(tmpptr, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, kptr[0], _val, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, kptr[1], _val, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, kptr[2], _val, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, kptr[3], _val, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, kptr[4], _val, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, kptr[5], _val, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, kptr[6], _val, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, kptr[7], _val, vl); tmpptr += packn; kptr += 8; } - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr1, _sum1, vl); - vse32_v_f32m1(outptr2, _sum2, vl); - vse32_v_f32m1(outptr3, _sum3, vl); - vse32_v_f32m1(outptr4, _sum4, vl); - vse32_v_f32m1(outptr5, _sum5, vl); - vse32_v_f32m1(outptr6, _sum6, vl); - vse32_v_f32m1(outptr7, _sum7, vl); + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr1, _sum1, vl); + __riscv_vse32_v_f32m1(outptr2, _sum2, vl); + __riscv_vse32_v_f32m1(outptr3, _sum3, vl); + __riscv_vse32_v_f32m1(outptr4, _sum4, vl); + __riscv_vse32_v_f32m1(outptr5, _sum5, vl); + __riscv_vse32_v_f32m1(outptr6, _sum6, vl); + __riscv_vse32_v_f32m1(outptr7, _sum7, vl); outptr0 += packn; outptr1 += packn; @@ -259,26 +259,26 @@ static void im2col_sgemm_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& int nn = inch * maxk; // inch always > 0 - vfloat32m1_t _sum0 = vfmv_v_f_f32m1(biasptr[0], vl); - vfloat32m1_t _sum1 = vfmv_v_f_f32m1(biasptr[1], vl); - vfloat32m1_t _sum2 = vfmv_v_f_f32m1(biasptr[2], vl); - vfloat32m1_t _sum3 = vfmv_v_f_f32m1(biasptr[3], vl); + vfloat32m1_t _sum0 = __riscv_vfmv_v_f_f32m1(biasptr[0], vl); + vfloat32m1_t _sum1 = __riscv_vfmv_v_f_f32m1(biasptr[1], vl); + 
vfloat32m1_t _sum2 = __riscv_vfmv_v_f_f32m1(biasptr[2], vl); + vfloat32m1_t _sum3 = __riscv_vfmv_v_f_f32m1(biasptr[3], vl); for (int q = 0; q < nn; q++) { - vfloat32m1_t _val = vle32_v_f32m1(tmpptr, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, kptr[0], _val, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, kptr[1], _val, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, kptr[2], _val, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, kptr[3], _val, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(tmpptr, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, kptr[0], _val, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, kptr[1], _val, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, kptr[2], _val, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, kptr[3], _val, vl); tmpptr += packn; kptr += 4; } - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr1, _sum1, vl); - vse32_v_f32m1(outptr2, _sum2, vl); - vse32_v_f32m1(outptr3, _sum3, vl); + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr1, _sum1, vl); + __riscv_vse32_v_f32m1(outptr2, _sum2, vl); + __riscv_vse32_v_f32m1(outptr3, _sum3, vl); outptr0 += packn; outptr1 += packn; @@ -423,16 +423,16 @@ static void im2col_sgemm_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& int nn = inch * maxk; // inch always > 0 - vfloat32m1_t _sum0 = vfmv_v_f_f32m1(bias0, vl); + vfloat32m1_t _sum0 = __riscv_vfmv_v_f_f32m1(bias0, vl); for (int q = 0; q < nn; q++) { - _sum0 = vfmacc_vf_f32m1(_sum0, kptr[0], vle32_v_f32m1(tmpptr, vl), vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, kptr[0], __riscv_vle32_v_f32m1(tmpptr, vl), vl); tmpptr += packn; kptr++; } - vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); outptr0 += packn; } diff --git a/src/layer/riscv/convolution_sgemm_fp16s.h b/src/layer/riscv/convolution_sgemm_fp16s.h index 72a621641db..6354b3836a4 100644 --- a/src/layer/riscv/convolution_sgemm_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_fp16s.h @@ -16,7 +16,7 @@ static void im2col_sgemm_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, con { #if __riscv_vector const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); #endif // Mat bottom_im2col(size, maxk, inch, 2u, 1, opt.workspace_allocator); @@ -52,7 +52,7 @@ static void im2col_sgemm_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, con for (int k = 0; k < maxk; k++) { - vse16_v_f16m1(tmpptr, vle16_v_f16m1(img0, vl), vl); + __riscv_vse16_v_f16m1(tmpptr, __riscv_vle16_v_f16m1(img0, vl), vl); img0 += size; tmpptr += packn; } @@ -131,38 +131,38 @@ static void im2col_sgemm_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, con int nn = inch * maxk; // inch always > 0 - vfloat16m1_t _sum0 = vfmv_v_f_f16m1(biasptr[0], vl); - vfloat16m1_t _sum1 = vfmv_v_f_f16m1(biasptr[1], vl); - vfloat16m1_t _sum2 = vfmv_v_f_f16m1(biasptr[2], vl); - vfloat16m1_t _sum3 = vfmv_v_f_f16m1(biasptr[3], vl); - vfloat16m1_t _sum4 = vfmv_v_f_f16m1(biasptr[4], vl); - vfloat16m1_t _sum5 = vfmv_v_f_f16m1(biasptr[5], vl); - vfloat16m1_t _sum6 = vfmv_v_f_f16m1(biasptr[6], vl); - vfloat16m1_t _sum7 = vfmv_v_f_f16m1(biasptr[7], vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(biasptr[0], vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(biasptr[1], vl); + vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1(biasptr[2], vl); + vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1(biasptr[3], vl); + vfloat16m1_t _sum4 = __riscv_vfmv_v_f_f16m1(biasptr[4], vl); + vfloat16m1_t _sum5 = __riscv_vfmv_v_f_f16m1(biasptr[5], vl); + vfloat16m1_t _sum6 = __riscv_vfmv_v_f_f16m1(biasptr[6], 
vl); + vfloat16m1_t _sum7 = __riscv_vfmv_v_f_f16m1(biasptr[7], vl); for (int q = 0; q < nn; q++) { - vfloat16m1_t _val = vle16_v_f16m1(tmpptr, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, kptr[0], _val, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, kptr[1], _val, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, kptr[2], _val, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, kptr[3], _val, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, kptr[4], _val, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, kptr[5], _val, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, kptr[6], _val, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, kptr[7], _val, vl); + vfloat16m1_t _val = __riscv_vle16_v_f16m1(tmpptr, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, kptr[0], _val, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, kptr[1], _val, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, kptr[2], _val, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, kptr[3], _val, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, kptr[4], _val, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, kptr[5], _val, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, kptr[6], _val, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, kptr[7], _val, vl); tmpptr += packn; kptr += 8; } - vse16_v_f16m1(outptr0, _sum0, vl); - vse16_v_f16m1(outptr1, _sum1, vl); - vse16_v_f16m1(outptr2, _sum2, vl); - vse16_v_f16m1(outptr3, _sum3, vl); - vse16_v_f16m1(outptr4, _sum4, vl); - vse16_v_f16m1(outptr5, _sum5, vl); - vse16_v_f16m1(outptr6, _sum6, vl); - vse16_v_f16m1(outptr7, _sum7, vl); + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr1, _sum1, vl); + __riscv_vse16_v_f16m1(outptr2, _sum2, vl); + __riscv_vse16_v_f16m1(outptr3, _sum3, vl); + __riscv_vse16_v_f16m1(outptr4, _sum4, vl); + __riscv_vse16_v_f16m1(outptr5, _sum5, vl); + __riscv_vse16_v_f16m1(outptr6, _sum6, vl); + __riscv_vse16_v_f16m1(outptr7, _sum7, vl); outptr0 += packn; outptr1 += packn; @@ -246,26 +246,26 @@ static void im2col_sgemm_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, con int nn = inch * maxk; // inch always > 0 - vfloat16m1_t _sum0 = vfmv_v_f_f16m1(biasptr[0], vl); - vfloat16m1_t _sum1 = vfmv_v_f_f16m1(biasptr[1], vl); - vfloat16m1_t _sum2 = vfmv_v_f_f16m1(biasptr[2], vl); - vfloat16m1_t _sum3 = vfmv_v_f_f16m1(biasptr[3], vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(biasptr[0], vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(biasptr[1], vl); + vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1(biasptr[2], vl); + vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1(biasptr[3], vl); for (int q = 0; q < nn; q++) { - vfloat16m1_t _val = vle16_v_f16m1(tmpptr, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, kptr[0], _val, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, kptr[1], _val, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, kptr[2], _val, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, kptr[3], _val, vl); + vfloat16m1_t _val = __riscv_vle16_v_f16m1(tmpptr, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, kptr[0], _val, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, kptr[1], _val, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, kptr[2], _val, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, kptr[3], _val, vl); tmpptr += packn; kptr += 4; } - vse16_v_f16m1(outptr0, _sum0, vl); - vse16_v_f16m1(outptr1, _sum1, vl); - vse16_v_f16m1(outptr2, _sum2, vl); - vse16_v_f16m1(outptr3, _sum3, vl); + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr1, _sum1, vl); + __riscv_vse16_v_f16m1(outptr2, _sum2, vl); + __riscv_vse16_v_f16m1(outptr3, _sum3, vl); outptr0 += packn; outptr1 += packn; @@ -323,16 +323,16 @@ static void im2col_sgemm_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, con int nn = 
inch * maxk; // inch always > 0 - vfloat16m1_t _sum0 = vfmv_v_f_f16m1(bias0, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(bias0, vl); for (int q = 0; q < nn; q++) { - _sum0 = vfmacc_vf_f16m1(_sum0, kptr[0], vle16_v_f16m1(tmpptr, vl), vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, kptr[0], __riscv_vle16_v_f16m1(tmpptr, vl), vl); tmpptr += packn; kptr++; } - vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); outptr0 += packn; } diff --git a/src/layer/riscv/convolution_sgemm_pack1ton.h b/src/layer/riscv/convolution_sgemm_pack1ton.h index 8a3e6ffbc43..c230be6c687 100644 --- a/src/layer/riscv/convolution_sgemm_pack1ton.h +++ b/src/layer/riscv/convolution_sgemm_pack1ton.h @@ -15,7 +15,7 @@ static void im2col_sgemm_pack1ton_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); // Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); @@ -63,23 +63,23 @@ static void im2col_sgemm_pack1ton_rvv(const Mat& bottom_im2col, Mat& top_blob, c int nn = inch * maxk; // inch always > 0 - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias) { - _sum = vle32_v_f32m1(bias + p * packn, vl); + _sum = __riscv_vle32_v_f32m1(bias + p * packn, vl); } for (int j = 0; j < nn; j++) { float val = *tmpptr++; - vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum = vfmacc_vf_f32m1(_sum, val, _w0, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(kptr0, vl); + _sum = __riscv_vfmacc_vf_f32m1(_sum, val, _w0, vl); kptr0 += packn; } - vse32_v_f32m1(outptr0, _sum, vl); + __riscv_vse32_v_f32m1(outptr0, _sum, vl); outptr0 += packn; } diff --git a/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h b/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h index 0c0b2791a8f..07f32cd3472 100644 --- a/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void im2col_sgemm_pack1ton_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); // Mat bottom_im2col(size, maxk, inch, 2u, 1, opt.workspace_allocator); @@ -63,23 +63,23 @@ static void im2col_sgemm_pack1ton_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ int nn = inch * maxk; // inch always > 0 - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); if (bias) { - _sum = vle16_v_f16m1(bias + p * packn, vl); + _sum = __riscv_vle16_v_f16m1(bias + p * packn, vl); } for (int j = 0; j < nn; j++) { __fp16 val = *tmpptr++; - vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum = vfmacc_vf_f16m1(_sum, val, _w0, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr0, vl); + _sum = __riscv_vfmacc_vf_f16m1(_sum, val, _w0, vl); kptr0 += packn; } - vse16_v_f16m1(outptr0, _sum, vl); + __riscv_vse16_v_f16m1(outptr0, _sum, vl); outptr0 += packn; } diff --git a/src/layer/riscv/convolution_sgemm_packn.h b/src/layer/riscv/convolution_sgemm_packn.h index 9255c092ae4..d955a1b0193 100644 --- a/src/layer/riscv/convolution_sgemm_packn.h +++ b/src/layer/riscv/convolution_sgemm_packn.h @@ -15,7 +15,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int 
packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); // Mat bottom_im2col(size, maxk, inch, 4u * packn, packn, opt.workspace_allocator); @@ -70,15 +70,15 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons img0 += size * packn; #else - vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl); - vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); - vfloat32m1_t _val2 = vle32_v_f32m1(img0 + packn * 2, vl); - vfloat32m1_t _val3 = vle32_v_f32m1(img0 + packn * 3, vl); - vfloat32m1_t _val4 = vle32_v_f32m1(img0 + packn * 4, vl); - vfloat32m1_t _val5 = vle32_v_f32m1(img0 + packn * 5, vl); - vfloat32m1_t _val6 = vle32_v_f32m1(img0 + packn * 6, vl); - vfloat32m1_t _val7 = vle32_v_f32m1(img0 + packn * 7, vl); - vsseg8e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); + vfloat32m1_t _val0 = __riscv_vle32_v_f32m1(img0, vl); + vfloat32m1_t _val1 = __riscv_vle32_v_f32m1(img0 + packn, vl); + vfloat32m1_t _val2 = __riscv_vle32_v_f32m1(img0 + packn * 2, vl); + vfloat32m1_t _val3 = __riscv_vle32_v_f32m1(img0 + packn * 3, vl); + vfloat32m1_t _val4 = __riscv_vle32_v_f32m1(img0 + packn * 4, vl); + vfloat32m1_t _val5 = __riscv_vle32_v_f32m1(img0 + packn * 5, vl); + vfloat32m1_t _val6 = __riscv_vle32_v_f32m1(img0 + packn * 6, vl); + vfloat32m1_t _val7 = __riscv_vle32_v_f32m1(img0 + packn * 7, vl); + __riscv_vsseg8e32_v_f32m1x8(tmpptr, __riscv_vcreate_v_f32m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); img0 += size * packn; tmpptr += packn * 8; @@ -115,11 +115,11 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons img0 += size * packn; #else - vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl); - vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); - vfloat32m1_t _val2 = vle32_v_f32m1(img0 + packn * 2, vl); - vfloat32m1_t _val3 = vle32_v_f32m1(img0 + packn * 3, vl); - vsseg4e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, vl); + vfloat32m1_t _val0 = __riscv_vle32_v_f32m1(img0, vl); + vfloat32m1_t _val1 = __riscv_vle32_v_f32m1(img0 + packn, vl); + vfloat32m1_t _val2 = __riscv_vle32_v_f32m1(img0 + packn * 2, vl); + vfloat32m1_t _val3 = __riscv_vle32_v_f32m1(img0 + packn * 3, vl); + __riscv_vsseg4e32_v_f32m1x4(tmpptr, __riscv_vcreate_v_f32m1x4(_val0, _val1, _val2, _val3), vl); img0 += size * packn; tmpptr += packn * 4; @@ -154,9 +154,9 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons img0 += size * packn; #else - vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl); - vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); - vsseg2e32_v_f32m1(tmpptr, _val0, _val1, vl); + vfloat32m1_t _val0 = __riscv_vle32_v_f32m1(img0, vl); + vfloat32m1_t _val1 = __riscv_vle32_v_f32m1(img0 + packn, vl); + __riscv_vsseg2e32_v_f32m1x2(tmpptr, __riscv_vcreate_v_f32m1x2(_val0, _val1), vl); img0 += size * packn; tmpptr += packn * 2; @@ -178,8 +178,8 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons for (int k = 0; k < maxk; k++) { - vfloat32m1_t _val = vle32_v_f32m1(img0, vl); - vse32_v_f32m1(tmpptr, _val, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(img0, vl); + __riscv_vse32_v_f32m1(tmpptr, _val, vl); img0 += size * packn; tmpptr += packn; @@ -201,25 +201,25 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons int nn = inch * maxk * packn; // inch always > 0 - vfloat32m1_t _sum0 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum1 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum2 = 
vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum3 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum4 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum5 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum6 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum7 = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum0 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum1 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum2 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum3 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum4 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum5 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum6 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum7 = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias) { - _sum0 = vle32_v_f32m1(bias + p * packn, vl); - _sum1 = vle32_v_f32m1(bias + p * packn, vl); - _sum2 = vle32_v_f32m1(bias + p * packn, vl); - _sum3 = vle32_v_f32m1(bias + p * packn, vl); - _sum4 = vle32_v_f32m1(bias + p * packn, vl); - _sum5 = vle32_v_f32m1(bias + p * packn, vl); - _sum6 = vle32_v_f32m1(bias + p * packn, vl); - _sum7 = vle32_v_f32m1(bias + p * packn, vl); + _sum0 = __riscv_vle32_v_f32m1(bias + p * packn, vl); + _sum1 = __riscv_vle32_v_f32m1(bias + p * packn, vl); + _sum2 = __riscv_vle32_v_f32m1(bias + p * packn, vl); + _sum3 = __riscv_vle32_v_f32m1(bias + p * packn, vl); + _sum4 = __riscv_vle32_v_f32m1(bias + p * packn, vl); + _sum5 = __riscv_vle32_v_f32m1(bias + p * packn, vl); + _sum6 = __riscv_vle32_v_f32m1(bias + p * packn, vl); + _sum7 = __riscv_vle32_v_f32m1(bias + p * packn, vl); } for (int j = 0; j < nn; j++) @@ -232,27 +232,27 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons float val5 = *tmpptr++; float val6 = *tmpptr++; float val7 = *tmpptr++; - vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, val0, _w0, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, val1, _w0, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, val2, _w0, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, val3, _w0, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, val4, _w0, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, val5, _w0, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, val6, _w0, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, val7, _w0, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, val0, _w0, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, val1, _w0, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, val2, _w0, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, val3, _w0, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, val4, _w0, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, val5, _w0, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, val6, _w0, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, val7, _w0, vl); kptr0 += packn; } - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + packn, _sum1, vl); - vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl); - vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl); - vse32_v_f32m1(outptr0 + packn * 4, _sum4, vl); - vse32_v_f32m1(outptr0 + packn * 5, _sum5, vl); - vse32_v_f32m1(outptr0 + packn * 6, _sum6, vl); - vse32_v_f32m1(outptr0 + packn * 7, _sum7, vl); + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0 + packn, _sum1, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 4, _sum4, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 5, _sum5, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 6, _sum6, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 7, _sum7, vl); outptr0 += packn * 8; } 
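// --- illustrative aside, not part of the patch ------------------------------
// The hunks in this file repeat two mechanical changes of the RVV intrinsic
// migration seen throughout this diff: every intrinsic gains the __riscv_
// prefix, and segment stores now take one tuple value built with
// __riscv_vcreate_* instead of N separate vector operands. A minimal sketch of
// both changes, assuming the prefixed v1.0 intrinsics from <riscv_vector.h>;
// the helper name interleave2_scale is hypothetical and handles a single
// vl-element chunk only.

#include <riscv_vector.h>

static void interleave2_scale(const float* a, const float* b, float* out, float s, int n)
{
    size_t vl = __riscv_vsetvl_e32m1(n);             // was: vsetvl_e32m1(n)
    vfloat32m1_t va = __riscv_vle32_v_f32m1(a, vl);  // was: vle32_v_f32m1(a, vl)
    vfloat32m1_t vb = __riscv_vle32_v_f32m1(b, vl);
    vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.f, vl);
    vs = __riscv_vfmacc_vf_f32m1(vs, s, va, vl);     // vs += s * a[i]
    // old API stored separate registers: vsseg2e32_v_f32m1(out, vs, vb, vl);
    // new API packs them into an f32m1x2 tuple first
    vfloat32m1x2_t vt = __riscv_vcreate_v_f32m1x2(vs, vb);
    __riscv_vsseg2e32_v_f32m1x2(out, vt, vl);        // out: vs[0], vb[0], vs[1], vb[1], ...
}
// The tuple form seems intended to let the compiler manage the register group
// for segment loads/stores explicitly, rather than inferring it from a
// variadic argument list.
// ---------------------------------------------------------------------------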
@@ -263,17 +263,17 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons int nn = inch * maxk * packn; // inch always > 0 - vfloat32m1_t _sum0 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum1 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum2 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum3 = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum0 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum1 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum2 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum3 = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias) { - _sum0 = vle32_v_f32m1(bias + p * packn, vl); - _sum1 = vle32_v_f32m1(bias + p * packn, vl); - _sum2 = vle32_v_f32m1(bias + p * packn, vl); - _sum3 = vle32_v_f32m1(bias + p * packn, vl); + _sum0 = __riscv_vle32_v_f32m1(bias + p * packn, vl); + _sum1 = __riscv_vle32_v_f32m1(bias + p * packn, vl); + _sum2 = __riscv_vle32_v_f32m1(bias + p * packn, vl); + _sum3 = __riscv_vle32_v_f32m1(bias + p * packn, vl); } for (int j = 0; j < nn; j++) @@ -282,19 +282,19 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons float val1 = *tmpptr++; float val2 = *tmpptr++; float val3 = *tmpptr++; - vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, val0, _w0, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, val1, _w0, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, val2, _w0, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, val3, _w0, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, val0, _w0, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, val1, _w0, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, val2, _w0, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, val3, _w0, vl); kptr0 += packn; } - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + packn, _sum1, vl); - vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl); - vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl); + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0 + packn, _sum1, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl); outptr0 += packn * 4; } @@ -305,28 +305,28 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons int nn = inch * maxk * packn; // inch always > 0 - vfloat32m1_t _sum0 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum1 = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum0 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum1 = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias) { - _sum0 = vle32_v_f32m1(bias + p * packn, vl); - _sum1 = vle32_v_f32m1(bias + p * packn, vl); + _sum0 = __riscv_vle32_v_f32m1(bias + p * packn, vl); + _sum1 = __riscv_vle32_v_f32m1(bias + p * packn, vl); } for (int j = 0; j < nn; j++) { float val0 = *tmpptr++; float val1 = *tmpptr++; - vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, val0, _w0, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, val1, _w0, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, val0, _w0, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, val1, _w0, vl); kptr0 += packn; } - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + packn, _sum1, vl); + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0 + packn, _sum1, vl); outptr0 += packn * 2; } @@ -337,23 +337,23 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons int nn = inch * maxk * packn; // inch always > 0 - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + 
vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias) { - _sum = vle32_v_f32m1(bias + p * packn, vl); + _sum = __riscv_vle32_v_f32m1(bias + p * packn, vl); } for (int j = 0; j < nn; j++) { float val = *tmpptr++; - vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum = vfmacc_vf_f32m1(_sum, val, _w0, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(kptr0, vl); + _sum = __riscv_vfmacc_vf_f32m1(_sum, val, _w0, vl); kptr0 += packn; } - vse32_v_f32m1(outptr0, _sum, vl); + __riscv_vse32_v_f32m1(outptr0, _sum, vl); outptr0 += packn; } @@ -363,7 +363,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons static void convolution_im2col_sgemm_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; @@ -396,8 +396,8 @@ static void convolution_im2col_sgemm_packn_rvv(const Mat& bottom_blob, Mat& top_ int j = 0; for (; j < outw; j++) { - vfloat32m1_t _val = vle32_v_f32m1(sptr, vl); - vse32_v_f32m1(ptr, _val, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(sptr, vl); + __riscv_vse32_v_f32m1(ptr, _val, vl); sptr += stride_w * packn; ptr += packn; diff --git a/src/layer/riscv/convolution_sgemm_packn_fp16s.h b/src/layer/riscv/convolution_sgemm_packn_fp16s.h index cb3b65196ed..8d728aa10f7 100644 --- a/src/layer/riscv/convolution_sgemm_packn_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_packn_fp16s.h @@ -15,7 +15,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); // Mat bottom_im2col(size, maxk, inch, 2u * packn, packn, opt.workspace_allocator); @@ -101,15 +101,15 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo img0 += size * packn; #endif #else - vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); - vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); - vfloat16m1_t _val2 = vle16_v_f16m1(img0 + packn * 2, vl); - vfloat16m1_t _val3 = vle16_v_f16m1(img0 + packn * 3, vl); - vfloat16m1_t _val4 = vle16_v_f16m1(img0 + packn * 4, vl); - vfloat16m1_t _val5 = vle16_v_f16m1(img0 + packn * 5, vl); - vfloat16m1_t _val6 = vle16_v_f16m1(img0 + packn * 6, vl); - vfloat16m1_t _val7 = vle16_v_f16m1(img0 + packn * 7, vl); - vsseg8e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); + vfloat16m1_t _val0 = __riscv_vle16_v_f16m1(img0, vl); + vfloat16m1_t _val1 = __riscv_vle16_v_f16m1(img0 + packn, vl); + vfloat16m1_t _val2 = __riscv_vle16_v_f16m1(img0 + packn * 2, vl); + vfloat16m1_t _val3 = __riscv_vle16_v_f16m1(img0 + packn * 3, vl); + vfloat16m1_t _val4 = __riscv_vle16_v_f16m1(img0 + packn * 4, vl); + vfloat16m1_t _val5 = __riscv_vle16_v_f16m1(img0 + packn * 5, vl); + vfloat16m1_t _val6 = __riscv_vle16_v_f16m1(img0 + packn * 6, vl); + vfloat16m1_t _val7 = __riscv_vle16_v_f16m1(img0 + packn * 7, vl); + __riscv_vsseg8e16_v_f16m1x8(tmpptr, __riscv_vcreate_v_f16m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); img0 += size * packn; tmpptr += packn * 8; @@ -168,11 +168,11 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo img0 += size * packn; #endif #else - 
vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); - vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); - vfloat16m1_t _val2 = vle16_v_f16m1(img0 + packn * 2, vl); - vfloat16m1_t _val3 = vle16_v_f16m1(img0 + packn * 3, vl); - vsseg4e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, vl); + vfloat16m1_t _val0 = __riscv_vle16_v_f16m1(img0, vl); + vfloat16m1_t _val1 = __riscv_vle16_v_f16m1(img0 + packn, vl); + vfloat16m1_t _val2 = __riscv_vle16_v_f16m1(img0 + packn * 2, vl); + vfloat16m1_t _val3 = __riscv_vle16_v_f16m1(img0 + packn * 3, vl); + __riscv_vsseg4e16_v_f16m1x4(tmpptr, __riscv_vcreate_v_f16m1x4(_val0, _val1, _val2, _val3), vl); img0 += size * packn; tmpptr += packn * 4; @@ -226,9 +226,9 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo img0 += size * packn; #endif #else - vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); - vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); - vsseg2e16_v_f16m1(tmpptr, _val0, _val1, vl); + vfloat16m1_t _val0 = __riscv_vle16_v_f16m1(img0, vl); + vfloat16m1_t _val1 = __riscv_vle16_v_f16m1(img0 + packn, vl); + __riscv_vsseg2e16_v_f16m1x2(tmpptr, __riscv_vcreate_v_f16m1x2(_val0, _val1), vl); img0 += size * packn; tmpptr += packn * 2; @@ -250,8 +250,8 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo for (int k = 0; k < maxk; k++) { - vfloat16m1_t _val = vle16_v_f16m1(img0, vl); - vse16_v_f16m1(tmpptr, _val, vl); + vfloat16m1_t _val = __riscv_vle16_v_f16m1(img0, vl); + __riscv_vse16_v_f16m1(tmpptr, _val, vl); img0 += size * packn; tmpptr += packn; @@ -273,25 +273,25 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo int nn = inch * maxk * packn; // inch always > 0 - vfloat16m1_t _sum0 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum1 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum2 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum3 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum4 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum5 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum6 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum7 = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum4 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum5 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum6 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum7 = __riscv_vfmv_v_f_f16m1(0.f, vl); if (bias) { - _sum0 = vle16_v_f16m1(bias + p * packn, vl); - _sum1 = vle16_v_f16m1(bias + p * packn, vl); - _sum2 = vle16_v_f16m1(bias + p * packn, vl); - _sum3 = vle16_v_f16m1(bias + p * packn, vl); - _sum4 = vle16_v_f16m1(bias + p * packn, vl); - _sum5 = vle16_v_f16m1(bias + p * packn, vl); - _sum6 = vle16_v_f16m1(bias + p * packn, vl); - _sum7 = vle16_v_f16m1(bias + p * packn, vl); + _sum0 = __riscv_vle16_v_f16m1(bias + p * packn, vl); + _sum1 = __riscv_vle16_v_f16m1(bias + p * packn, vl); + _sum2 = __riscv_vle16_v_f16m1(bias + p * packn, vl); + _sum3 = __riscv_vle16_v_f16m1(bias + p * packn, vl); + _sum4 = __riscv_vle16_v_f16m1(bias + p * packn, vl); + _sum5 = __riscv_vle16_v_f16m1(bias + p * packn, vl); + _sum6 = __riscv_vle16_v_f16m1(bias + p * packn, vl); + _sum7 = __riscv_vle16_v_f16m1(bias + p * packn, vl); } for (int j = 0; j < nn; j++) @@ -304,27 +304,27 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo __fp16 val5 = *tmpptr++; __fp16 val6 = 
*tmpptr++; __fp16 val7 = *tmpptr++; - vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, val0, _w0, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, val1, _w0, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, val2, _w0, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, val3, _w0, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, val4, _w0, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, val5, _w0, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, val6, _w0, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, val7, _w0, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, val0, _w0, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, val1, _w0, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, val2, _w0, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, val3, _w0, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, val4, _w0, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, val5, _w0, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, val6, _w0, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, val7, _w0, vl); kptr0 += packn; } - vse16_v_f16m1(outptr0, _sum0, vl); - vse16_v_f16m1(outptr0 + packn, _sum1, vl); - vse16_v_f16m1(outptr0 + packn * 2, _sum2, vl); - vse16_v_f16m1(outptr0 + packn * 3, _sum3, vl); - vse16_v_f16m1(outptr0 + packn * 4, _sum4, vl); - vse16_v_f16m1(outptr0 + packn * 5, _sum5, vl); - vse16_v_f16m1(outptr0 + packn * 6, _sum6, vl); - vse16_v_f16m1(outptr0 + packn * 7, _sum7, vl); + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr0 + packn, _sum1, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 2, _sum2, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 3, _sum3, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 4, _sum4, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 5, _sum5, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 6, _sum6, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 7, _sum7, vl); outptr0 += packn * 8; } @@ -335,17 +335,17 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo int nn = inch * maxk * packn; // inch always > 0 - vfloat16m1_t _sum0 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum1 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum2 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum3 = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1(0.f, vl); if (bias) { - _sum0 = vle16_v_f16m1(bias + p * packn, vl); - _sum1 = vle16_v_f16m1(bias + p * packn, vl); - _sum2 = vle16_v_f16m1(bias + p * packn, vl); - _sum3 = vle16_v_f16m1(bias + p * packn, vl); + _sum0 = __riscv_vle16_v_f16m1(bias + p * packn, vl); + _sum1 = __riscv_vle16_v_f16m1(bias + p * packn, vl); + _sum2 = __riscv_vle16_v_f16m1(bias + p * packn, vl); + _sum3 = __riscv_vle16_v_f16m1(bias + p * packn, vl); } for (int j = 0; j < nn; j++) @@ -354,19 +354,19 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo __fp16 val1 = *tmpptr++; __fp16 val2 = *tmpptr++; __fp16 val3 = *tmpptr++; - vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, val0, _w0, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, val1, _w0, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, val2, _w0, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, val3, _w0, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, val0, _w0, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, val1, _w0, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, val2, _w0, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, 
val3, _w0, vl); kptr0 += packn; } - vse16_v_f16m1(outptr0, _sum0, vl); - vse16_v_f16m1(outptr0 + packn, _sum1, vl); - vse16_v_f16m1(outptr0 + packn * 2, _sum2, vl); - vse16_v_f16m1(outptr0 + packn * 3, _sum3, vl); + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr0 + packn, _sum1, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 2, _sum2, vl); + __riscv_vse16_v_f16m1(outptr0 + packn * 3, _sum3, vl); outptr0 += packn * 4; } @@ -377,28 +377,28 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo int nn = inch * maxk * packn; // inch always > 0 - vfloat16m1_t _sum0 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum1 = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl); if (bias) { - _sum0 = vle16_v_f16m1(bias + p * packn, vl); - _sum1 = vle16_v_f16m1(bias + p * packn, vl); + _sum0 = __riscv_vle16_v_f16m1(bias + p * packn, vl); + _sum1 = __riscv_vle16_v_f16m1(bias + p * packn, vl); } for (int j = 0; j < nn; j++) { __fp16 val0 = *tmpptr++; __fp16 val1 = *tmpptr++; - vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, val0, _w0, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, val1, _w0, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, val0, _w0, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, val1, _w0, vl); kptr0 += packn; } - vse16_v_f16m1(outptr0, _sum0, vl); - vse16_v_f16m1(outptr0 + packn, _sum1, vl); + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr0 + packn, _sum1, vl); outptr0 += packn * 2; } @@ -409,23 +409,23 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo int nn = inch * maxk * packn; // inch always > 0 - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); if (bias) { - _sum = vle16_v_f16m1(bias + p * packn, vl); + _sum = __riscv_vle16_v_f16m1(bias + p * packn, vl); } for (int j = 0; j < nn; j++) { __fp16 val = *tmpptr++; - vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum = vfmacc_vf_f16m1(_sum, val, _w0, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr0, vl); + _sum = __riscv_vfmacc_vf_f16m1(_sum, val, _w0, vl); kptr0 += packn; } - vse16_v_f16m1(outptr0, _sum, vl); + __riscv_vse16_v_f16m1(outptr0, _sum, vl); outptr0 += packn; } @@ -435,7 +435,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo static void convolution_im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; @@ -468,8 +468,8 @@ static void convolution_im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_blob, Ma int j = 0; for (; j < outw; j++) { - vfloat16m1_t _val = vle16_v_f16m1(sptr, vl); - vse16_v_f16m1(ptr, _val, vl); + vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr, vl); + __riscv_vse16_v_f16m1(ptr, _val, vl); sptr += stride_w * packn; ptr += packn; diff --git a/src/layer/riscv/convolution_sgemm_packnto1.h b/src/layer/riscv/convolution_sgemm_packnto1.h index 2df2c7d7656..869e2b6a210 100644 --- a/src/layer/riscv/convolution_sgemm_packnto1.h +++ b/src/layer/riscv/convolution_sgemm_packnto1.h @@ -15,7 +15,7 @@ static void 
im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); // Mat bottom_im2col(size, maxk, inch, 4u * packn, packn, opt.workspace_allocator); @@ -69,15 +69,15 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c img0 += size * packn; #else - vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl); - vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); - vfloat32m1_t _val2 = vle32_v_f32m1(img0 + packn * 2, vl); - vfloat32m1_t _val3 = vle32_v_f32m1(img0 + packn * 3, vl); - vfloat32m1_t _val4 = vle32_v_f32m1(img0 + packn * 4, vl); - vfloat32m1_t _val5 = vle32_v_f32m1(img0 + packn * 5, vl); - vfloat32m1_t _val6 = vle32_v_f32m1(img0 + packn * 6, vl); - vfloat32m1_t _val7 = vle32_v_f32m1(img0 + packn * 7, vl); - vsseg8e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); + vfloat32m1_t _val0 = __riscv_vle32_v_f32m1(img0, vl); + vfloat32m1_t _val1 = __riscv_vle32_v_f32m1(img0 + packn, vl); + vfloat32m1_t _val2 = __riscv_vle32_v_f32m1(img0 + packn * 2, vl); + vfloat32m1_t _val3 = __riscv_vle32_v_f32m1(img0 + packn * 3, vl); + vfloat32m1_t _val4 = __riscv_vle32_v_f32m1(img0 + packn * 4, vl); + vfloat32m1_t _val5 = __riscv_vle32_v_f32m1(img0 + packn * 5, vl); + vfloat32m1_t _val6 = __riscv_vle32_v_f32m1(img0 + packn * 6, vl); + vfloat32m1_t _val7 = __riscv_vle32_v_f32m1(img0 + packn * 7, vl); + __riscv_vsseg8e32_v_f32m1x8(tmpptr, __riscv_vcreate_v_f32m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); img0 += size * packn; tmpptr += packn * 8; @@ -114,11 +114,11 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c img0 += size * packn; #else - vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl); - vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); - vfloat32m1_t _val2 = vle32_v_f32m1(img0 + packn * 2, vl); - vfloat32m1_t _val3 = vle32_v_f32m1(img0 + packn * 3, vl); - vsseg4e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, vl); + vfloat32m1_t _val0 = __riscv_vle32_v_f32m1(img0, vl); + vfloat32m1_t _val1 = __riscv_vle32_v_f32m1(img0 + packn, vl); + vfloat32m1_t _val2 = __riscv_vle32_v_f32m1(img0 + packn * 2, vl); + vfloat32m1_t _val3 = __riscv_vle32_v_f32m1(img0 + packn * 3, vl); + __riscv_vsseg4e32_v_f32m1x4(tmpptr, __riscv_vcreate_v_f32m1x4(_val0, _val1, _val2, _val3), vl); img0 += size * packn; tmpptr += packn * 4; @@ -153,9 +153,9 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c img0 += size * packn; #else - vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl); - vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); - vsseg2e32_v_f32m1(tmpptr, _val0, _val1, vl); + vfloat32m1_t _val0 = __riscv_vle32_v_f32m1(img0, vl); + vfloat32m1_t _val1 = __riscv_vle32_v_f32m1(img0 + packn, vl); + __riscv_vsseg2e32_v_f32m1x2(tmpptr, __riscv_vcreate_v_f32m1x2(_val0, _val1), vl); img0 += size * packn; tmpptr += packn * 2; @@ -177,8 +177,8 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c for (int k = 0; k < maxk; k++) { - vfloat32m1_t _val = vle32_v_f32m1(img0, vl); - vse32_v_f32m1(tmpptr, _val, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(img0, vl); + __riscv_vse32_v_f32m1(tmpptr, _val, vl); img0 += size * packn; tmpptr += packn; @@ -220,14 +220,14 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c int nn = inch * maxk * packn; // inch always 
> 0 - vfloat32m1_t _sum0 = vle32_v_f32m1(biasptr, vl); - vfloat32m1_t _sum1 = vle32_v_f32m1(biasptr, vl); - vfloat32m1_t _sum2 = vle32_v_f32m1(biasptr, vl); - vfloat32m1_t _sum3 = vle32_v_f32m1(biasptr, vl); - vfloat32m1_t _sum4 = vle32_v_f32m1(biasptr, vl); - vfloat32m1_t _sum5 = vle32_v_f32m1(biasptr, vl); - vfloat32m1_t _sum6 = vle32_v_f32m1(biasptr, vl); - vfloat32m1_t _sum7 = vle32_v_f32m1(biasptr, vl); + vfloat32m1_t _sum0 = __riscv_vle32_v_f32m1(biasptr, vl); + vfloat32m1_t _sum1 = __riscv_vle32_v_f32m1(biasptr, vl); + vfloat32m1_t _sum2 = __riscv_vle32_v_f32m1(biasptr, vl); + vfloat32m1_t _sum3 = __riscv_vle32_v_f32m1(biasptr, vl); + vfloat32m1_t _sum4 = __riscv_vle32_v_f32m1(biasptr, vl); + vfloat32m1_t _sum5 = __riscv_vle32_v_f32m1(biasptr, vl); + vfloat32m1_t _sum6 = __riscv_vle32_v_f32m1(biasptr, vl); + vfloat32m1_t _sum7 = __riscv_vle32_v_f32m1(biasptr, vl); for (int j = 0; j < nn; j++) { @@ -239,30 +239,30 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c float val5 = *tmpptr++; float val6 = *tmpptr++; float val7 = *tmpptr++; - vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, val0, _w0, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, val1, _w0, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, val2, _w0, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, val3, _w0, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, val4, _w0, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, val5, _w0, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, val6, _w0, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, val7, _w0, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, val0, _w0, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, val1, _w0, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, val2, _w0, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, val3, _w0, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, val4, _w0, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, val5, _w0, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, val6, _w0, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, val7, _w0, vl); kptr0 += packn; } #if C906 - vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, vl); - vsse32_v_f32m1(outptr0 + 1, top_blob.cstep * sizeof(float), _sum1, vl); - vsse32_v_f32m1(outptr0 + 2, top_blob.cstep * sizeof(float), _sum2, vl); - vsse32_v_f32m1(outptr0 + 3, top_blob.cstep * sizeof(float), _sum3, vl); - vsse32_v_f32m1(outptr0 + 4, top_blob.cstep * sizeof(float), _sum4, vl); - vsse32_v_f32m1(outptr0 + 5, top_blob.cstep * sizeof(float), _sum5, vl); - vsse32_v_f32m1(outptr0 + 6, top_blob.cstep * sizeof(float), _sum6, vl); - vsse32_v_f32m1(outptr0 + 7, top_blob.cstep * sizeof(float), _sum7, vl); + __riscv_vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, vl); + __riscv_vsse32_v_f32m1(outptr0 + 1, top_blob.cstep * sizeof(float), _sum1, vl); + __riscv_vsse32_v_f32m1(outptr0 + 2, top_blob.cstep * sizeof(float), _sum2, vl); + __riscv_vsse32_v_f32m1(outptr0 + 3, top_blob.cstep * sizeof(float), _sum3, vl); + __riscv_vsse32_v_f32m1(outptr0 + 4, top_blob.cstep * sizeof(float), _sum4, vl); + __riscv_vsse32_v_f32m1(outptr0 + 5, top_blob.cstep * sizeof(float), _sum5, vl); + __riscv_vsse32_v_f32m1(outptr0 + 6, top_blob.cstep * sizeof(float), _sum6, vl); + __riscv_vsse32_v_f32m1(outptr0 + 7, top_blob.cstep * sizeof(float), _sum7, vl); #else - vssseg8e32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, vl); + __riscv_vssseg8e32_v_f32m1x8(outptr0, top_blob.cstep * sizeof(float), __riscv_vcreate_v_f32m1x8(_sum0, _sum1, _sum2, 
_sum3, _sum4, _sum5, _sum6, _sum7), vl); #endif outptr0 += 8; } @@ -273,10 +273,10 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c int nn = inch * maxk * packn; // inch always > 0 - vfloat32m1_t _sum0 = vle32_v_f32m1(biasptr, vl); - vfloat32m1_t _sum1 = vle32_v_f32m1(biasptr, vl); - vfloat32m1_t _sum2 = vle32_v_f32m1(biasptr, vl); - vfloat32m1_t _sum3 = vle32_v_f32m1(biasptr, vl); + vfloat32m1_t _sum0 = __riscv_vle32_v_f32m1(biasptr, vl); + vfloat32m1_t _sum1 = __riscv_vle32_v_f32m1(biasptr, vl); + vfloat32m1_t _sum2 = __riscv_vle32_v_f32m1(biasptr, vl); + vfloat32m1_t _sum3 = __riscv_vle32_v_f32m1(biasptr, vl); for (int j = 0; j < nn; j++) { @@ -284,22 +284,22 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c float val1 = *tmpptr++; float val2 = *tmpptr++; float val3 = *tmpptr++; - vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, val0, _w0, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, val1, _w0, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, val2, _w0, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, val3, _w0, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, val0, _w0, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, val1, _w0, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, val2, _w0, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, val3, _w0, vl); kptr0 += packn; } #if C906 - vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, vl); - vsse32_v_f32m1(outptr0 + 1, top_blob.cstep * sizeof(float), _sum1, vl); - vsse32_v_f32m1(outptr0 + 2, top_blob.cstep * sizeof(float), _sum2, vl); - vsse32_v_f32m1(outptr0 + 3, top_blob.cstep * sizeof(float), _sum3, vl); + __riscv_vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, vl); + __riscv_vsse32_v_f32m1(outptr0 + 1, top_blob.cstep * sizeof(float), _sum1, vl); + __riscv_vsse32_v_f32m1(outptr0 + 2, top_blob.cstep * sizeof(float), _sum2, vl); + __riscv_vsse32_v_f32m1(outptr0 + 3, top_blob.cstep * sizeof(float), _sum3, vl); #else - vssseg4e32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, _sum1, _sum2, _sum3, vl); + __riscv_vssseg4e32_v_f32m1x4(outptr0, top_blob.cstep * sizeof(float), __riscv_vcreate_v_f32m1x4(_sum0, _sum1, _sum2, _sum3), vl); #endif outptr0 += 4; } @@ -310,25 +310,25 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c int nn = inch * maxk * packn; // inch always > 0 - vfloat32m1_t _sum0 = vle32_v_f32m1(biasptr, vl); - vfloat32m1_t _sum1 = vle32_v_f32m1(biasptr, vl); + vfloat32m1_t _sum0 = __riscv_vle32_v_f32m1(biasptr, vl); + vfloat32m1_t _sum1 = __riscv_vle32_v_f32m1(biasptr, vl); for (int j = 0; j < nn; j++) { float val0 = *tmpptr++; float val1 = *tmpptr++; - vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, val0, _w0, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, val1, _w0, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, val0, _w0, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, val1, _w0, vl); kptr0 += packn; } #if C906 - vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, vl); - vsse32_v_f32m1(outptr0 + 1, top_blob.cstep * sizeof(float), _sum1, vl); + __riscv_vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, vl); + __riscv_vsse32_v_f32m1(outptr0 + 1, top_blob.cstep * sizeof(float), _sum1, vl); #else - vssseg2e32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, _sum1, vl); + __riscv_vssseg2e32_v_f32m1x2(outptr0, top_blob.cstep * sizeof(float), 
__riscv_vcreate_v_f32m1x2(_sum0, _sum1), vl); #endif outptr0 += 2; } @@ -339,18 +339,18 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c int nn = inch * maxk * packn; // inch always > 0 - vfloat32m1_t _sum = vle32_v_f32m1(biasptr, vl); + vfloat32m1_t _sum = __riscv_vle32_v_f32m1(biasptr, vl); for (int j = 0; j < nn; j++) { float val = *tmpptr++; - vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum = vfmacc_vf_f32m1(_sum, val, _w0, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(kptr0, vl); + _sum = __riscv_vfmacc_vf_f32m1(_sum, val, _w0, vl); kptr0 += packn; } - vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum, vl); + __riscv_vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum, vl); outptr0 += 1; } @@ -383,35 +383,27 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c float sum6 = bias0; float sum7 = bias0; - vfloat32m1_t _sum0 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum1 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum2 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum3 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum4 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum5 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum6 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum7 = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum0 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum1 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum2 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum3 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum4 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum5 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum6 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum7 = __riscv_vfmv_v_f_f32m1(0.f, vl); for (int j = 0; j < nn; j++) { - vfloat32m1_t _val0; - vfloat32m1_t _val1; - vfloat32m1_t _val2; - vfloat32m1_t _val3; - vfloat32m1_t _val4; - vfloat32m1_t _val5; - vfloat32m1_t _val6; - vfloat32m1_t _val7; - vlseg8e32_v_f32m1(&_val0, &_val1, &_val2, &_val3, &_val4, &_val5, &_val6, &_val7, tmpptr, vl); - vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _val0, _w0, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _val1, _w0, vl); - _sum2 = vfmacc_vv_f32m1(_sum2, _val2, _w0, vl); - _sum3 = vfmacc_vv_f32m1(_sum3, _val3, _w0, vl); - _sum4 = vfmacc_vv_f32m1(_sum4, _val4, _w0, vl); - _sum5 = vfmacc_vv_f32m1(_sum5, _val5, _w0, vl); - _sum6 = vfmacc_vv_f32m1(_sum6, _val6, _w0, vl); - _sum7 = vfmacc_vv_f32m1(_sum7, _val7, _w0, vl); + vfloat32m1x8_t _val = __riscv_vlseg8e32_v_f32m1x8(tmpptr, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, __riscv_vget_v_f32m1x8_f32m1(_val, 0), _w0, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, __riscv_vget_v_f32m1x8_f32m1(_val, 1), _w0, vl); + _sum2 = __riscv_vfmacc_vv_f32m1(_sum2, __riscv_vget_v_f32m1x8_f32m1(_val, 2), _w0, vl); + _sum3 = __riscv_vfmacc_vv_f32m1(_sum3, __riscv_vget_v_f32m1x8_f32m1(_val, 3), _w0, vl); + _sum4 = __riscv_vfmacc_vv_f32m1(_sum4, __riscv_vget_v_f32m1x8_f32m1(_val, 4), _w0, vl); + _sum5 = __riscv_vfmacc_vv_f32m1(_sum5, __riscv_vget_v_f32m1x8_f32m1(_val, 5), _w0, vl); + _sum6 = __riscv_vfmacc_vv_f32m1(_sum6, __riscv_vget_v_f32m1x8_f32m1(_val, 6), _w0, vl); + _sum7 = __riscv_vfmacc_vv_f32m1(_sum7, __riscv_vget_v_f32m1x8_f32m1(_val, 7), _w0, vl); tmpptr += packn * 8; kptr0 += packn; } @@ -426,14 +418,14 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c std::vector ss5(packn); std::vector ss6(packn); std::vector ss7(packn); - 
vse32_v_f32m1((float*)ss0.data(), _sum0, vl); - vse32_v_f32m1((float*)ss1.data(), _sum1, vl); - vse32_v_f32m1((float*)ss2.data(), _sum2, vl); - vse32_v_f32m1((float*)ss3.data(), _sum3, vl); - vse32_v_f32m1((float*)ss4.data(), _sum4, vl); - vse32_v_f32m1((float*)ss5.data(), _sum5, vl); - vse32_v_f32m1((float*)ss6.data(), _sum6, vl); - vse32_v_f32m1((float*)ss7.data(), _sum7, vl); + __riscv_vse32_v_f32m1((float*)ss0.data(), _sum0, vl); + __riscv_vse32_v_f32m1((float*)ss1.data(), _sum1, vl); + __riscv_vse32_v_f32m1((float*)ss2.data(), _sum2, vl); + __riscv_vse32_v_f32m1((float*)ss3.data(), _sum3, vl); + __riscv_vse32_v_f32m1((float*)ss4.data(), _sum4, vl); + __riscv_vse32_v_f32m1((float*)ss5.data(), _sum5, vl); + __riscv_vse32_v_f32m1((float*)ss6.data(), _sum6, vl); + __riscv_vse32_v_f32m1((float*)ss7.data(), _sum7, vl); for (int i = 0; i < packn; i++) { sum0 += ss0[i]; @@ -446,14 +438,14 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c sum7 += ss7[i]; } #else - sum0 = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m1_f32m1(vfloat32m1_t(), _sum0, vfmv_s_f_f32m1(vfloat32m1_t(), sum0, vl), vl)); - sum1 = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m1_f32m1(vfloat32m1_t(), _sum1, vfmv_s_f_f32m1(vfloat32m1_t(), sum1, vl), vl)); - sum2 = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m1_f32m1(vfloat32m1_t(), _sum2, vfmv_s_f_f32m1(vfloat32m1_t(), sum2, vl), vl)); - sum3 = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m1_f32m1(vfloat32m1_t(), _sum3, vfmv_s_f_f32m1(vfloat32m1_t(), sum3, vl), vl)); - sum4 = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m1_f32m1(vfloat32m1_t(), _sum4, vfmv_s_f_f32m1(vfloat32m1_t(), sum4, vl), vl)); - sum5 = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m1_f32m1(vfloat32m1_t(), _sum5, vfmv_s_f_f32m1(vfloat32m1_t(), sum5, vl), vl)); - sum6 = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m1_f32m1(vfloat32m1_t(), _sum6, vfmv_s_f_f32m1(vfloat32m1_t(), sum6, vl), vl)); - sum7 = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m1_f32m1(vfloat32m1_t(), _sum7, vfmv_s_f_f32m1(vfloat32m1_t(), sum7, vl), vl)); + sum0 = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m1_f32m1(_sum0, __riscv_vfmv_s_f_f32m1(sum0, vl), vl)); + sum1 = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m1_f32m1(_sum1, __riscv_vfmv_s_f_f32m1(sum1, vl), vl)); + sum2 = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m1_f32m1(_sum2, __riscv_vfmv_s_f_f32m1(sum2, vl), vl)); + sum3 = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m1_f32m1(_sum3, __riscv_vfmv_s_f_f32m1(sum3, vl), vl)); + sum4 = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m1_f32m1(_sum4, __riscv_vfmv_s_f_f32m1(sum4, vl), vl)); + sum5 = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m1_f32m1(_sum5, __riscv_vfmv_s_f_f32m1(sum5, vl), vl)); + sum6 = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m1_f32m1(_sum6, __riscv_vfmv_s_f_f32m1(sum6, vl), vl)); + sum7 = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m1_f32m1(_sum7, __riscv_vfmv_s_f_f32m1(sum7, vl), vl)); #endif outptr0[0] = sum0; @@ -479,23 +471,19 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c float sum2 = bias0; float sum3 = bias0; - vfloat32m1_t _sum0 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum1 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum2 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum3 = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum0 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum1 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum2 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum3 = __riscv_vfmv_v_f_f32m1(0.f, vl); for (int j = 0; j < nn; j++) { - 
vfloat32m1_t _val0; - vfloat32m1_t _val1; - vfloat32m1_t _val2; - vfloat32m1_t _val3; - vlseg4e32_v_f32m1(&_val0, &_val1, &_val2, &_val3, tmpptr, vl); - vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _val0, _w0, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _val1, _w0, vl); - _sum2 = vfmacc_vv_f32m1(_sum2, _val2, _w0, vl); - _sum3 = vfmacc_vv_f32m1(_sum3, _val3, _w0, vl); + vfloat32m1x4_t _val = __riscv_vlseg4e32_v_f32m1x4(tmpptr, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, __riscv_vget_v_f32m1x4_f32m1(_val, 0), _w0, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, __riscv_vget_v_f32m1x4_f32m1(_val, 1), _w0, vl); + _sum2 = __riscv_vfmacc_vv_f32m1(_sum2, __riscv_vget_v_f32m1x4_f32m1(_val, 2), _w0, vl); + _sum3 = __riscv_vfmacc_vv_f32m1(_sum3, __riscv_vget_v_f32m1x4_f32m1(_val, 3), _w0, vl); tmpptr += packn * 4; kptr0 += packn; } @@ -506,10 +494,10 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c std::vector ss1(packn); std::vector ss2(packn); std::vector ss3(packn); - vse32_v_f32m1((float*)ss0.data(), _sum0, vl); - vse32_v_f32m1((float*)ss1.data(), _sum1, vl); - vse32_v_f32m1((float*)ss2.data(), _sum2, vl); - vse32_v_f32m1((float*)ss3.data(), _sum3, vl); + __riscv_vse32_v_f32m1((float*)ss0.data(), _sum0, vl); + __riscv_vse32_v_f32m1((float*)ss1.data(), _sum1, vl); + __riscv_vse32_v_f32m1((float*)ss2.data(), _sum2, vl); + __riscv_vse32_v_f32m1((float*)ss3.data(), _sum3, vl); for (int i = 0; i < packn; i++) { sum0 += ss0[i]; @@ -518,10 +506,10 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c sum3 += ss3[i]; } #else - sum0 = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m1_f32m1(vfloat32m1_t(), _sum0, vfmv_s_f_f32m1(vfloat32m1_t(), sum0, vl), vl)); - sum1 = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m1_f32m1(vfloat32m1_t(), _sum1, vfmv_s_f_f32m1(vfloat32m1_t(), sum1, vl), vl)); - sum2 = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m1_f32m1(vfloat32m1_t(), _sum2, vfmv_s_f_f32m1(vfloat32m1_t(), sum2, vl), vl)); - sum3 = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m1_f32m1(vfloat32m1_t(), _sum3, vfmv_s_f_f32m1(vfloat32m1_t(), sum3, vl), vl)); + sum0 = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m1_f32m1(_sum0, __riscv_vfmv_s_f_f32m1(sum0, vl), vl)); + sum1 = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m1_f32m1(_sum1, __riscv_vfmv_s_f_f32m1(sum1, vl), vl)); + sum2 = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m1_f32m1(_sum2, __riscv_vfmv_s_f_f32m1(sum2, vl), vl)); + sum3 = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m1_f32m1(_sum3, __riscv_vfmv_s_f_f32m1(sum3, vl), vl)); #endif outptr0[0] = sum0; @@ -541,17 +529,15 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c float sum0 = bias0; float sum1 = bias0; - vfloat32m1_t _sum0 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum1 = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum0 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum1 = __riscv_vfmv_v_f_f32m1(0.f, vl); for (int j = 0; j < nn; j++) { - vfloat32m1_t _val0; - vfloat32m1_t _val1; - vlseg2e32_v_f32m1(&_val0, &_val1, tmpptr, vl); - vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _val0, _w0, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _val1, _w0, vl); + vfloat32m1x2_t _val = __riscv_vlseg2e32_v_f32m1x2(tmpptr, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, __riscv_vget_v_f32m1x2_f32m1(_val, 0), _w0, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, 
__riscv_vget_v_f32m1x2_f32m1(_val, 1), _w0, vl); tmpptr += packn * 2; kptr0 += packn; } @@ -560,16 +546,16 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c // TODO std::vector ss0(packn); std::vector ss1(packn); - vse32_v_f32m1((float*)ss0.data(), _sum0, vl); - vse32_v_f32m1((float*)ss1.data(), _sum1, vl); + __riscv_vse32_v_f32m1((float*)ss0.data(), _sum0, vl); + __riscv_vse32_v_f32m1((float*)ss1.data(), _sum1, vl); for (int i = 0; i < packn; i++) { sum0 += ss0[i]; sum1 += ss1[i]; } #else - sum0 = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m1_f32m1(vfloat32m1_t(), _sum0, vfmv_s_f_f32m1(vfloat32m1_t(), sum0, vl), vl)); - sum1 = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m1_f32m1(vfloat32m1_t(), _sum1, vfmv_s_f_f32m1(vfloat32m1_t(), sum1, vl), vl)); + sum0 = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m1_f32m1(_sum0, __riscv_vfmv_s_f_f32m1(sum0, vl), vl)); + sum1 = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m1_f32m1(_sum1, __riscv_vfmv_s_f_f32m1(sum1, vl), vl)); #endif outptr0[0] = sum0; @@ -586,13 +572,13 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c float sum0 = bias0; - vfloat32m1_t _sum0 = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum0 = __riscv_vfmv_v_f_f32m1(0.f, vl); for (int j = 0; j < nn; j++) { - vfloat32m1_t _val0 = vle32_v_f32m1(tmpptr, vl); - vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _val0, _w0, vl); + vfloat32m1_t _val0 = __riscv_vle32_v_f32m1(tmpptr, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _val0, _w0, vl); tmpptr += packn; kptr0 += packn; } @@ -600,13 +586,13 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c #if C906 // TODO std::vector ss0(packn); - vse32_v_f32m1((float*)ss0.data(), _sum0, vl); + __riscv_vse32_v_f32m1((float*)ss0.data(), _sum0, vl); for (int i = 0; i < packn; i++) { sum0 += ss0[i]; } #else - sum0 = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m1_f32m1(vfloat32m1_t(), _sum0, vfmv_s_f_f32m1(vfloat32m1_t(), sum0, vl), vl)); + sum0 = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m1_f32m1(_sum0, __riscv_vfmv_s_f_f32m1(sum0, vl), vl)); #endif outptr0[0] = sum0; @@ -677,7 +663,7 @@ static void convolution_im2col_sgemm_transform_kernel_packnto1_rvv(const Mat& _k static void convolution_im2col_sgemm_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; @@ -710,8 +696,8 @@ static void convolution_im2col_sgemm_packnto1_rvv(const Mat& bottom_blob, Mat& t int j = 0; for (; j < outw; j++) { - vfloat32m1_t _val = vle32_v_f32m1(sptr, vl); - vse32_v_f32m1(ptr, _val, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(sptr, vl); + __riscv_vse32_v_f32m1(ptr, _val, vl); sptr += stride_w * packn; ptr += packn; diff --git a/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h b/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h index 925713d9826..76d153fd4c4 100644 --- a/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h @@ -15,7 +15,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const 
size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); // Mat bottom_im2col(size, maxk, inch, 2u * packn, packn, opt.workspace_allocator); @@ -69,15 +69,15 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ img0 += size * packn; #else - vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); - vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); - vfloat16m1_t _val2 = vle16_v_f16m1(img0 + packn * 2, vl); - vfloat16m1_t _val3 = vle16_v_f16m1(img0 + packn * 3, vl); - vfloat16m1_t _val4 = vle16_v_f16m1(img0 + packn * 4, vl); - vfloat16m1_t _val5 = vle16_v_f16m1(img0 + packn * 5, vl); - vfloat16m1_t _val6 = vle16_v_f16m1(img0 + packn * 6, vl); - vfloat16m1_t _val7 = vle16_v_f16m1(img0 + packn * 7, vl); - vsseg8e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); + vfloat16m1_t _val0 = __riscv_vle16_v_f16m1(img0, vl); + vfloat16m1_t _val1 = __riscv_vle16_v_f16m1(img0 + packn, vl); + vfloat16m1_t _val2 = __riscv_vle16_v_f16m1(img0 + packn * 2, vl); + vfloat16m1_t _val3 = __riscv_vle16_v_f16m1(img0 + packn * 3, vl); + vfloat16m1_t _val4 = __riscv_vle16_v_f16m1(img0 + packn * 4, vl); + vfloat16m1_t _val5 = __riscv_vle16_v_f16m1(img0 + packn * 5, vl); + vfloat16m1_t _val6 = __riscv_vle16_v_f16m1(img0 + packn * 6, vl); + vfloat16m1_t _val7 = __riscv_vle16_v_f16m1(img0 + packn * 7, vl); + __riscv_vsseg8e16_v_f16m1x8(tmpptr, __riscv_vcreate_v_f16m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); img0 += size * packn; tmpptr += packn * 8; @@ -114,11 +114,11 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ img0 += size * packn; #else - vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); - vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); - vfloat16m1_t _val2 = vle16_v_f16m1(img0 + packn * 2, vl); - vfloat16m1_t _val3 = vle16_v_f16m1(img0 + packn * 3, vl); - vsseg4e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, vl); + vfloat16m1_t _val0 = __riscv_vle16_v_f16m1(img0, vl); + vfloat16m1_t _val1 = __riscv_vle16_v_f16m1(img0 + packn, vl); + vfloat16m1_t _val2 = __riscv_vle16_v_f16m1(img0 + packn * 2, vl); + vfloat16m1_t _val3 = __riscv_vle16_v_f16m1(img0 + packn * 3, vl); + __riscv_vsseg4e16_v_f16m1x4(tmpptr, __riscv_vcreate_v_f16m1x4(_val0, _val1, _val2, _val3), vl); img0 += size * packn; tmpptr += packn * 4; @@ -153,9 +153,9 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ img0 += size * packn; #else - vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); - vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); - vsseg2e16_v_f16m1(tmpptr, _val0, _val1, vl); + vfloat16m1_t _val0 = __riscv_vle16_v_f16m1(img0, vl); + vfloat16m1_t _val1 = __riscv_vle16_v_f16m1(img0 + packn, vl); + __riscv_vsseg2e16_v_f16m1x2(tmpptr, __riscv_vcreate_v_f16m1x2(_val0, _val1), vl); img0 += size * packn; tmpptr += packn * 2; @@ -177,8 +177,8 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ for (int k = 0; k < maxk; k++) { - vfloat16m1_t _val = vle16_v_f16m1(img0, vl); - vse16_v_f16m1(tmpptr, _val, vl); + vfloat16m1_t _val = __riscv_vle16_v_f16m1(img0, vl); + __riscv_vse16_v_f16m1(tmpptr, _val, vl); img0 += size * packn; tmpptr += packn; @@ -220,14 +220,14 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ int nn = inch * maxk * packn; // inch always > 0 - vfloat16m1_t _sum0 = vle16_v_f16m1(biasptr, vl); - vfloat16m1_t _sum1 = vle16_v_f16m1(biasptr, vl); - vfloat16m1_t _sum2 = vle16_v_f16m1(biasptr, vl); - 
vfloat16m1_t _sum3 = vle16_v_f16m1(biasptr, vl); - vfloat16m1_t _sum4 = vle16_v_f16m1(biasptr, vl); - vfloat16m1_t _sum5 = vle16_v_f16m1(biasptr, vl); - vfloat16m1_t _sum6 = vle16_v_f16m1(biasptr, vl); - vfloat16m1_t _sum7 = vle16_v_f16m1(biasptr, vl); + vfloat16m1_t _sum0 = __riscv_vle16_v_f16m1(biasptr, vl); + vfloat16m1_t _sum1 = __riscv_vle16_v_f16m1(biasptr, vl); + vfloat16m1_t _sum2 = __riscv_vle16_v_f16m1(biasptr, vl); + vfloat16m1_t _sum3 = __riscv_vle16_v_f16m1(biasptr, vl); + vfloat16m1_t _sum4 = __riscv_vle16_v_f16m1(biasptr, vl); + vfloat16m1_t _sum5 = __riscv_vle16_v_f16m1(biasptr, vl); + vfloat16m1_t _sum6 = __riscv_vle16_v_f16m1(biasptr, vl); + vfloat16m1_t _sum7 = __riscv_vle16_v_f16m1(biasptr, vl); for (int j = 0; j < nn; j++) { @@ -239,30 +239,30 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ __fp16 val5 = *tmpptr++; __fp16 val6 = *tmpptr++; __fp16 val7 = *tmpptr++; - vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, val0, _w0, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, val1, _w0, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, val2, _w0, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, val3, _w0, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, val4, _w0, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, val5, _w0, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, val6, _w0, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, val7, _w0, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, val0, _w0, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, val1, _w0, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, val2, _w0, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, val3, _w0, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, val4, _w0, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, val5, _w0, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, val6, _w0, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, val7, _w0, vl); kptr0 += packn; } #if C906 - vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, vl); - vsse16_v_f16m1(outptr0 + 1, top_blob.cstep * sizeof(__fp16), _sum1, vl); - vsse16_v_f16m1(outptr0 + 2, top_blob.cstep * sizeof(__fp16), _sum2, vl); - vsse16_v_f16m1(outptr0 + 3, top_blob.cstep * sizeof(__fp16), _sum3, vl); - vsse16_v_f16m1(outptr0 + 4, top_blob.cstep * sizeof(__fp16), _sum4, vl); - vsse16_v_f16m1(outptr0 + 5, top_blob.cstep * sizeof(__fp16), _sum5, vl); - vsse16_v_f16m1(outptr0 + 6, top_blob.cstep * sizeof(__fp16), _sum6, vl); - vsse16_v_f16m1(outptr0 + 7, top_blob.cstep * sizeof(__fp16), _sum7, vl); + __riscv_vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, vl); + __riscv_vsse16_v_f16m1(outptr0 + 1, top_blob.cstep * sizeof(__fp16), _sum1, vl); + __riscv_vsse16_v_f16m1(outptr0 + 2, top_blob.cstep * sizeof(__fp16), _sum2, vl); + __riscv_vsse16_v_f16m1(outptr0 + 3, top_blob.cstep * sizeof(__fp16), _sum3, vl); + __riscv_vsse16_v_f16m1(outptr0 + 4, top_blob.cstep * sizeof(__fp16), _sum4, vl); + __riscv_vsse16_v_f16m1(outptr0 + 5, top_blob.cstep * sizeof(__fp16), _sum5, vl); + __riscv_vsse16_v_f16m1(outptr0 + 6, top_blob.cstep * sizeof(__fp16), _sum6, vl); + __riscv_vsse16_v_f16m1(outptr0 + 7, top_blob.cstep * sizeof(__fp16), _sum7, vl); #else - vssseg8e16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, vl); + __riscv_vssseg8e16_v_f16m1x8(outptr0, top_blob.cstep * sizeof(__fp16), __riscv_vcreate_v_f16m1x8(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7), vl); #endif outptr0 += 8; } @@ -273,10 +273,10 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const 
Mat& bottom_im2col, Mat& top_ int nn = inch * maxk * packn; // inch always > 0 - vfloat16m1_t _sum0 = vle16_v_f16m1(biasptr, vl); - vfloat16m1_t _sum1 = vle16_v_f16m1(biasptr, vl); - vfloat16m1_t _sum2 = vle16_v_f16m1(biasptr, vl); - vfloat16m1_t _sum3 = vle16_v_f16m1(biasptr, vl); + vfloat16m1_t _sum0 = __riscv_vle16_v_f16m1(biasptr, vl); + vfloat16m1_t _sum1 = __riscv_vle16_v_f16m1(biasptr, vl); + vfloat16m1_t _sum2 = __riscv_vle16_v_f16m1(biasptr, vl); + vfloat16m1_t _sum3 = __riscv_vle16_v_f16m1(biasptr, vl); for (int j = 0; j < nn; j++) { @@ -284,22 +284,22 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ __fp16 val1 = *tmpptr++; __fp16 val2 = *tmpptr++; __fp16 val3 = *tmpptr++; - vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, val0, _w0, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, val1, _w0, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, val2, _w0, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, val3, _w0, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, val0, _w0, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, val1, _w0, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, val2, _w0, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, val3, _w0, vl); kptr0 += packn; } #if C906 - vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, vl); - vsse16_v_f16m1(outptr0 + 1, top_blob.cstep * sizeof(__fp16), _sum1, vl); - vsse16_v_f16m1(outptr0 + 2, top_blob.cstep * sizeof(__fp16), _sum2, vl); - vsse16_v_f16m1(outptr0 + 3, top_blob.cstep * sizeof(__fp16), _sum3, vl); + __riscv_vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, vl); + __riscv_vsse16_v_f16m1(outptr0 + 1, top_blob.cstep * sizeof(__fp16), _sum1, vl); + __riscv_vsse16_v_f16m1(outptr0 + 2, top_blob.cstep * sizeof(__fp16), _sum2, vl); + __riscv_vsse16_v_f16m1(outptr0 + 3, top_blob.cstep * sizeof(__fp16), _sum3, vl); #else - vssseg4e16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, _sum1, _sum2, _sum3, vl); + __riscv_vssseg4e16_v_f16m1x4(outptr0, top_blob.cstep * sizeof(__fp16), __riscv_vcreate_v_f16m1x4(_sum0, _sum1, _sum2, _sum3), vl); #endif outptr0 += 4; } @@ -310,25 +310,25 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ int nn = inch * maxk * packn; // inch always > 0 - vfloat16m1_t _sum0 = vle16_v_f16m1(biasptr, vl); - vfloat16m1_t _sum1 = vle16_v_f16m1(biasptr, vl); + vfloat16m1_t _sum0 = __riscv_vle16_v_f16m1(biasptr, vl); + vfloat16m1_t _sum1 = __riscv_vle16_v_f16m1(biasptr, vl); for (int j = 0; j < nn; j++) { __fp16 val0 = *tmpptr++; __fp16 val1 = *tmpptr++; - vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, val0, _w0, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, val1, _w0, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, val0, _w0, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, val1, _w0, vl); kptr0 += packn; } #if C906 - vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, vl); - vsse16_v_f16m1(outptr0 + 1, top_blob.cstep * sizeof(__fp16), _sum1, vl); + __riscv_vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, vl); + __riscv_vsse16_v_f16m1(outptr0 + 1, top_blob.cstep * sizeof(__fp16), _sum1, vl); #else - vssseg2e16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, _sum1, vl); + __riscv_vssseg2e16_v_f16m1x2(outptr0, top_blob.cstep * sizeof(__fp16), __riscv_vcreate_v_f16m1x2(_sum0, _sum1), vl); #endif outptr0 += 2; } @@ -339,18 +339,18 @@ static void 
im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ int nn = inch * maxk * packn; // inch always > 0 - vfloat16m1_t _sum = vle16_v_f16m1(biasptr, vl); + vfloat16m1_t _sum = __riscv_vle16_v_f16m1(biasptr, vl); for (int j = 0; j < nn; j++) { __fp16 val = *tmpptr++; - vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum = vfmacc_vf_f16m1(_sum, val, _w0, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr0, vl); + _sum = __riscv_vfmacc_vf_f16m1(_sum, val, _w0, vl); kptr0 += packn; } - vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum, vl); + __riscv_vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum, vl); outptr0 += 1; } @@ -383,35 +383,27 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ __fp16 sum6 = bias0; __fp16 sum7 = bias0; - vfloat16m1_t _sum0 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum1 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum2 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum3 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum4 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum5 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum6 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum7 = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum4 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum5 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum6 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum7 = __riscv_vfmv_v_f_f16m1(0.f, vl); for (int j = 0; j < nn; j++) { - vfloat16m1_t _val0; - vfloat16m1_t _val1; - vfloat16m1_t _val2; - vfloat16m1_t _val3; - vfloat16m1_t _val4; - vfloat16m1_t _val5; - vfloat16m1_t _val6; - vfloat16m1_t _val7; - vlseg8e16_v_f16m1(&_val0, &_val1, &_val2, &_val3, &_val4, &_val5, &_val6, &_val7, tmpptr, vl); - vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl); - _sum2 = vfmacc_vv_f16m1(_sum2, _val2, _w0, vl); - _sum3 = vfmacc_vv_f16m1(_sum3, _val3, _w0, vl); - _sum4 = vfmacc_vv_f16m1(_sum4, _val4, _w0, vl); - _sum5 = vfmacc_vv_f16m1(_sum5, _val5, _w0, vl); - _sum6 = vfmacc_vv_f16m1(_sum6, _val6, _w0, vl); - _sum7 = vfmacc_vv_f16m1(_sum7, _val7, _w0, vl); + vfloat16m1x8_t _val = __riscv_vlseg8e16_v_f16m1x8(tmpptr, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, __riscv_vget_v_f16m1x8_f16m1(_val, 0), _w0, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, __riscv_vget_v_f16m1x8_f16m1(_val, 1), _w0, vl); + _sum2 = __riscv_vfmacc_vv_f16m1(_sum2, __riscv_vget_v_f16m1x8_f16m1(_val, 2), _w0, vl); + _sum3 = __riscv_vfmacc_vv_f16m1(_sum3, __riscv_vget_v_f16m1x8_f16m1(_val, 3), _w0, vl); + _sum4 = __riscv_vfmacc_vv_f16m1(_sum4, __riscv_vget_v_f16m1x8_f16m1(_val, 4), _w0, vl); + _sum5 = __riscv_vfmacc_vv_f16m1(_sum5, __riscv_vget_v_f16m1x8_f16m1(_val, 5), _w0, vl); + _sum6 = __riscv_vfmacc_vv_f16m1(_sum6, __riscv_vget_v_f16m1x8_f16m1(_val, 6), _w0, vl); + _sum7 = __riscv_vfmacc_vv_f16m1(_sum7, __riscv_vget_v_f16m1x8_f16m1(_val, 7), _w0, vl); tmpptr += packn * 8; kptr0 += packn; } @@ -426,14 +418,14 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ std::vector<__fp16> ss5(packn); std::vector<__fp16> ss6(packn); std::vector<__fp16> ss7(packn); - vse16_v_f16m1((__fp16*)ss0.data(), _sum0, vl); - 
vse16_v_f16m1((__fp16*)ss1.data(), _sum1, vl); - vse16_v_f16m1((__fp16*)ss2.data(), _sum2, vl); - vse16_v_f16m1((__fp16*)ss3.data(), _sum3, vl); - vse16_v_f16m1((__fp16*)ss4.data(), _sum4, vl); - vse16_v_f16m1((__fp16*)ss5.data(), _sum5, vl); - vse16_v_f16m1((__fp16*)ss6.data(), _sum6, vl); - vse16_v_f16m1((__fp16*)ss7.data(), _sum7, vl); + __riscv_vse16_v_f16m1((__fp16*)ss0.data(), _sum0, vl); + __riscv_vse16_v_f16m1((__fp16*)ss1.data(), _sum1, vl); + __riscv_vse16_v_f16m1((__fp16*)ss2.data(), _sum2, vl); + __riscv_vse16_v_f16m1((__fp16*)ss3.data(), _sum3, vl); + __riscv_vse16_v_f16m1((__fp16*)ss4.data(), _sum4, vl); + __riscv_vse16_v_f16m1((__fp16*)ss5.data(), _sum5, vl); + __riscv_vse16_v_f16m1((__fp16*)ss6.data(), _sum6, vl); + __riscv_vse16_v_f16m1((__fp16*)ss7.data(), _sum7, vl); for (int i = 0; i < packn; i++) { sum0 += ss0[i]; @@ -446,14 +438,14 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ sum7 += ss7[i]; } #else - sum0 = vfmv_f_s_f16m1_f16(vfredusum_vs_f16m1_f16m1(vfloat16m1_t(), _sum0, vfmv_s_f_f16m1(vfloat16m1_t(), sum0, vl), vl)); - sum1 = vfmv_f_s_f16m1_f16(vfredusum_vs_f16m1_f16m1(vfloat16m1_t(), _sum1, vfmv_s_f_f16m1(vfloat16m1_t(), sum1, vl), vl)); - sum2 = vfmv_f_s_f16m1_f16(vfredusum_vs_f16m1_f16m1(vfloat16m1_t(), _sum2, vfmv_s_f_f16m1(vfloat16m1_t(), sum2, vl), vl)); - sum3 = vfmv_f_s_f16m1_f16(vfredusum_vs_f16m1_f16m1(vfloat16m1_t(), _sum3, vfmv_s_f_f16m1(vfloat16m1_t(), sum3, vl), vl)); - sum4 = vfmv_f_s_f16m1_f16(vfredusum_vs_f16m1_f16m1(vfloat16m1_t(), _sum4, vfmv_s_f_f16m1(vfloat16m1_t(), sum4, vl), vl)); - sum5 = vfmv_f_s_f16m1_f16(vfredusum_vs_f16m1_f16m1(vfloat16m1_t(), _sum5, vfmv_s_f_f16m1(vfloat16m1_t(), sum5, vl), vl)); - sum6 = vfmv_f_s_f16m1_f16(vfredusum_vs_f16m1_f16m1(vfloat16m1_t(), _sum6, vfmv_s_f_f16m1(vfloat16m1_t(), sum6, vl), vl)); - sum7 = vfmv_f_s_f16m1_f16(vfredusum_vs_f16m1_f16m1(vfloat16m1_t(), _sum7, vfmv_s_f_f16m1(vfloat16m1_t(), sum7, vl), vl)); + sum0 = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum0, __riscv_vfmv_s_f_f16m1(sum0, vl), vl)); + sum1 = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum1, __riscv_vfmv_s_f_f16m1(sum1, vl), vl)); + sum2 = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum2, __riscv_vfmv_s_f_f16m1(sum2, vl), vl)); + sum3 = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum3, __riscv_vfmv_s_f_f16m1(sum3, vl), vl)); + sum4 = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum4, __riscv_vfmv_s_f_f16m1(sum4, vl), vl)); + sum5 = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum5, __riscv_vfmv_s_f_f16m1(sum5, vl), vl)); + sum6 = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum6, __riscv_vfmv_s_f_f16m1(sum6, vl), vl)); + sum7 = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum7, __riscv_vfmv_s_f_f16m1(sum7, vl), vl)); #endif outptr0[0] = sum0; @@ -479,24 +471,19 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ __fp16 sum2 = bias0; __fp16 sum3 = bias0; - vfloat16m1_t _sum0 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum1 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum2 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum3 = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1(0.f, vl); for (int j = 0; j < nn; j++) { - vfloat16m1_t _val0; - vfloat16m1_t 
_val1; - vfloat16m1_t _val2; - vfloat16m1_t _val3; - - vlseg4e16_v_f16m1(&_val0, &_val1, &_val2, &_val3, tmpptr, vl); - vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl); - _sum2 = vfmacc_vv_f16m1(_sum2, _val2, _w0, vl); - _sum3 = vfmacc_vv_f16m1(_sum3, _val3, _w0, vl); + vfloat16m1x4_t _val = __riscv_vlseg4e16_v_f16m1x4(tmpptr, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, __riscv_vget_v_f16m1x4_f16m1(_val, 0), _w0, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, __riscv_vget_v_f16m1x4_f16m1(_val, 1), _w0, vl); + _sum2 = __riscv_vfmacc_vv_f16m1(_sum2, __riscv_vget_v_f16m1x4_f16m1(_val, 2), _w0, vl); + _sum3 = __riscv_vfmacc_vv_f16m1(_sum3, __riscv_vget_v_f16m1x4_f16m1(_val, 3), _w0, vl); tmpptr += packn * 4; kptr0 += packn; } @@ -507,10 +494,10 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ std::vector<__fp16> ss1(packn); std::vector<__fp16> ss2(packn); std::vector<__fp16> ss3(packn); - vse16_v_f16m1((__fp16*)ss0.data(), _sum0, vl); - vse16_v_f16m1((__fp16*)ss1.data(), _sum1, vl); - vse16_v_f16m1((__fp16*)ss2.data(), _sum2, vl); - vse16_v_f16m1((__fp16*)ss3.data(), _sum3, vl); + __riscv_vse16_v_f16m1((__fp16*)ss0.data(), _sum0, vl); + __riscv_vse16_v_f16m1((__fp16*)ss1.data(), _sum1, vl); + __riscv_vse16_v_f16m1((__fp16*)ss2.data(), _sum2, vl); + __riscv_vse16_v_f16m1((__fp16*)ss3.data(), _sum3, vl); for (int i = 0; i < packn; i++) { sum0 += ss0[i]; @@ -519,10 +506,10 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ sum3 += ss3[i]; } #else - sum0 = vfmv_f_s_f16m1_f16(vfredusum_vs_f16m1_f16m1(vfloat16m1_t(), _sum0, vfmv_s_f_f16m1(vfloat16m1_t(), sum0, vl), vl)); - sum1 = vfmv_f_s_f16m1_f16(vfredusum_vs_f16m1_f16m1(vfloat16m1_t(), _sum1, vfmv_s_f_f16m1(vfloat16m1_t(), sum1, vl), vl)); - sum2 = vfmv_f_s_f16m1_f16(vfredusum_vs_f16m1_f16m1(vfloat16m1_t(), _sum2, vfmv_s_f_f16m1(vfloat16m1_t(), sum2, vl), vl)); - sum3 = vfmv_f_s_f16m1_f16(vfredusum_vs_f16m1_f16m1(vfloat16m1_t(), _sum3, vfmv_s_f_f16m1(vfloat16m1_t(), sum3, vl), vl)); + sum0 = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum0, __riscv_vfmv_s_f_f16m1(sum0, vl), vl)); + sum1 = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum1, __riscv_vfmv_s_f_f16m1(sum1, vl), vl)); + sum2 = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum2, __riscv_vfmv_s_f_f16m1(sum2, vl), vl)); + sum3 = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum3, __riscv_vfmv_s_f_f16m1(sum3, vl), vl)); #endif outptr0[0] = sum0; @@ -542,17 +529,15 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ __fp16 sum0 = bias0; __fp16 sum1 = bias0; - vfloat16m1_t _sum0 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum1 = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl); for (int j = 0; j < nn; j++) { - vfloat16m1_t _val0; - vfloat16m1_t _val1; - vlseg2e16_v_f16m1(&_val0, &_val1, tmpptr, vl); - vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl); + vfloat16m1x2_t _val = __riscv_vlseg2e16_v_f16m1x2(tmpptr, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, __riscv_vget_v_f16m1x2_f16m1(_val, 0), _w0, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, 
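// [editorial sketch, not part of the patch] The reduction hunks above track another API change:
// the legacy vfredusum_vs_f16m1_f16m1 (and vfmv_s_f_f16m1) took a destination/maskedoff vector
// as its first argument, passed here as a default-constructed vfloat16m1_t(), while the v1.0
// __riscv_ forms drop it and take just (vector, scalar seed, vl). On the C906 builds the same
// horizontal sum is instead done by spilling the vector to a small buffer and accumulating in
// scalar code (the #if C906 branches above); the patch only renames the store there.
// The horizontal-sum idiom used throughout this file, shown standalone (names invented):
#include <riscv_vector.h>

static inline __fp16 hsum_f16_example(vfloat16m1_t v, __fp16 seed, size_t vl)
{
    // seed carries the bias, so the result is seed + sum(v[0..vl-1])
    vfloat16m1_t s = __riscv_vfredusum_vs_f16m1_f16m1(v, __riscv_vfmv_s_f_f16m1(seed, vl), vl);
    return __riscv_vfmv_f_s_f16m1_f16(s);
}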
__riscv_vget_v_f16m1x2_f16m1(_val, 1), _w0, vl); tmpptr += packn * 2; kptr0 += packn; } @@ -561,16 +546,16 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ // TODO std::vector<__fp16> ss0(packn); std::vector<__fp16> ss1(packn); - vse16_v_f16m1((__fp16*)ss0.data(), _sum0, vl); - vse16_v_f16m1((__fp16*)ss1.data(), _sum1, vl); + __riscv_vse16_v_f16m1((__fp16*)ss0.data(), _sum0, vl); + __riscv_vse16_v_f16m1((__fp16*)ss1.data(), _sum1, vl); for (int i = 0; i < packn; i++) { sum0 += ss0[i]; sum1 += ss1[i]; } #else - sum0 = vfmv_f_s_f16m1_f16(vfredusum_vs_f16m1_f16m1(vfloat16m1_t(), _sum0, vfmv_s_f_f16m1(vfloat16m1_t(), sum0, vl), vl)); - sum1 = vfmv_f_s_f16m1_f16(vfredusum_vs_f16m1_f16m1(vfloat16m1_t(), _sum1, vfmv_s_f_f16m1(vfloat16m1_t(), sum1, vl), vl)); + sum0 = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum0, __riscv_vfmv_s_f_f16m1(sum0, vl), vl)); + sum1 = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum1, __riscv_vfmv_s_f_f16m1(sum1, vl), vl)); #endif outptr0[0] = sum0; @@ -587,13 +572,13 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ __fp16 sum0 = bias0; - vfloat16m1_t _sum0 = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); for (int j = 0; j < nn; j++) { - vfloat16m1_t _val0 = vle16_v_f16m1(tmpptr, vl); - vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl); + vfloat16m1_t _val0 = __riscv_vle16_v_f16m1(tmpptr, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr0, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _val0, _w0, vl); tmpptr += packn; kptr0 += packn; } @@ -601,13 +586,13 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ #if C906 // TODO std::vector<__fp16> ss0(packn); - vse16_v_f16m1((__fp16*)ss0.data(), _sum0, vl); + __riscv_vse16_v_f16m1((__fp16*)ss0.data(), _sum0, vl); for (int i = 0; i < packn; i++) { sum0 += ss0[i]; } #else - sum0 = vfmv_f_s_f16m1_f16(vfredusum_vs_f16m1_f16m1(vfloat16m1_t(), _sum0, vfmv_s_f_f16m1(vfloat16m1_t(), sum0, vl), vl)); + sum0 = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum0, __riscv_vfmv_s_f_f16m1(sum0, vl), vl)); #endif outptr0[0] = sum0; @@ -678,7 +663,7 @@ static void convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(const static void convolution_im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; @@ -711,8 +696,8 @@ static void convolution_im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_blob, int j = 0; for (; j < outw; j++) { - vfloat16m1_t _val = vle16_v_f16m1(sptr, vl); - vse16_v_f16m1(ptr, _val, vl); + vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr, vl); + __riscv_vse16_v_f16m1(ptr, _val, vl); sptr += stride_w * packn; ptr += packn; diff --git a/src/layer/riscv/convolution_winograd_dot.h b/src/layer/riscv/convolution_winograd_dot.h index c0a7b7680f8..9670bb8c727 100644 --- a/src/layer/riscv/convolution_winograd_dot.h +++ b/src/layer/riscv/convolution_winograd_dot.h @@ -16,7 +16,7 @@ static void convolution_winograd_dot_rvv(Mat& bottom_blob_tm, int outch, const M { #if __riscv_vector const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + 
const size_t vl = __riscv_vsetvl_e32m1(packn); #endif // Mat bottom_blob_tm(tiles, 16/36/64, inch, 4u, opt.workspace_allocator); @@ -55,7 +55,7 @@ static void convolution_winograd_dot_rvv(Mat& bottom_blob_tm, int outch, const M for (int q = 0; q < inch; q++) { - vse32_v_f32m1(tmpptr, vle32_v_f32m1(r0, vl), vl); + __riscv_vse32_v_f32m1(tmpptr, __riscv_vle32_v_f32m1(r0, vl), vl); r0 += bottom_blob_tm.cstep; tmpptr += packn; } @@ -140,39 +140,39 @@ static void convolution_winograd_dot_rvv(Mat& bottom_blob_tm, int outch, const M int nn = inch; // inch always > 0 - vfloat32m1_t _sum0 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum1 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum2 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum3 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum4 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum5 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum6 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum7 = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum0 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum1 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum2 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum3 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum4 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum5 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum6 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum7 = __riscv_vfmv_v_f_f32m1(0.f, vl); int j = 0; for (; j < nn; j++) { - vfloat32m1_t _val = vle32_v_f32m1(r0, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, k0[0], _val, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, k0[1], _val, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, k0[2], _val, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, k0[3], _val, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, k0[4], _val, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, k0[5], _val, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, k0[6], _val, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, k0[7], _val, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(r0, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, k0[0], _val, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, k0[1], _val, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, k0[2], _val, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, k0[3], _val, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, k0[4], _val, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, k0[5], _val, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, k0[6], _val, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, k0[7], _val, vl); r0 += packn; k0 += 8; } - vse32_v_f32m1(output0_tm, _sum0, vl); - vse32_v_f32m1(output1_tm, _sum1, vl); - vse32_v_f32m1(output2_tm, _sum2, vl); - vse32_v_f32m1(output3_tm, _sum3, vl); - vse32_v_f32m1(output4_tm, _sum4, vl); - vse32_v_f32m1(output5_tm, _sum5, vl); - vse32_v_f32m1(output6_tm, _sum6, vl); - vse32_v_f32m1(output7_tm, _sum7, vl); + __riscv_vse32_v_f32m1(output0_tm, _sum0, vl); + __riscv_vse32_v_f32m1(output1_tm, _sum1, vl); + __riscv_vse32_v_f32m1(output2_tm, _sum2, vl); + __riscv_vse32_v_f32m1(output3_tm, _sum3, vl); + __riscv_vse32_v_f32m1(output4_tm, _sum4, vl); + __riscv_vse32_v_f32m1(output5_tm, _sum5, vl); + __riscv_vse32_v_f32m1(output6_tm, _sum6, vl); + __riscv_vse32_v_f32m1(output7_tm, _sum7, vl); output0_tm += packn; output1_tm += packn; @@ -262,27 +262,27 @@ static void convolution_winograd_dot_rvv(Mat& bottom_blob_tm, int outch, const M int nn = inch; // inch always > 0 - vfloat32m1_t _sum0 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum1 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum2 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum3 = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum0 = 
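// [editorial sketch, not part of the patch] As in the rest of these kernels, packn is the number
// of fp32 lanes in one vector register (csrr_vlenb() is an ncnn helper returning VLEN in bytes),
// and vl is fixed once per kernel via the renamed __riscv_vsetvl_e32m1. The inner product then
// accumulates one input scalar against a packn-wide weight row with __riscv_vfmacc_vf_f32m1.
// A minimal standalone version of that loop (names invented):
#include <riscv_vector.h>

static inline void axpy_rows_example(const float* x, const float* w, float* y, int n)
{
    const size_t vl = __riscv_vsetvlmax_e32m1(); // whole-register vl, analogous to vsetvl(packn)
    vfloat32m1_t sum = __riscv_vfmv_v_f_f32m1(0.f, vl);
    for (int j = 0; j < n; j++)
    {
        vfloat32m1_t wrow = __riscv_vle32_v_f32m1(w, vl);
        sum = __riscv_vfmacc_vf_f32m1(sum, x[j], wrow, vl); // sum += x[j] * wrow
        w += vl;
    }
    __riscv_vse32_v_f32m1(y, sum, vl);
}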
__riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum1 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum2 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum3 = __riscv_vfmv_v_f_f32m1(0.f, vl); int j = 0; for (; j < nn; j++) { - vfloat32m1_t _val = vle32_v_f32m1(r0, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, k0[0], _val, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, k0[1], _val, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, k0[2], _val, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, k0[3], _val, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(r0, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, k0[0], _val, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, k0[1], _val, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, k0[2], _val, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, k0[3], _val, vl); r0 += packn; k0 += 4; } - vse32_v_f32m1(output0_tm, _sum0, vl); - vse32_v_f32m1(output1_tm, _sum1, vl); - vse32_v_f32m1(output2_tm, _sum2, vl); - vse32_v_f32m1(output3_tm, _sum3, vl); + __riscv_vse32_v_f32m1(output0_tm, _sum0, vl); + __riscv_vse32_v_f32m1(output1_tm, _sum1, vl); + __riscv_vse32_v_f32m1(output2_tm, _sum2, vl); + __riscv_vse32_v_f32m1(output3_tm, _sum3, vl); output0_tm += packn; output1_tm += packn; @@ -444,16 +444,16 @@ static void convolution_winograd_dot_rvv(Mat& bottom_blob_tm, int outch, const M int nn = inch; // inch always > 0 - vfloat32m1_t _sum0 = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum0 = __riscv_vfmv_v_f_f32m1(0.f, vl); for (int j = 0; j < nn; j++) { - _sum0 = vfmacc_vf_f32m1(_sum0, k0[0], vle32_v_f32m1(r0, vl), vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, k0[0], __riscv_vle32_v_f32m1(r0, vl), vl); r0 += packn; k0++; } - vse32_v_f32m1(output0_tm, _sum0, vl); + __riscv_vse32_v_f32m1(output0_tm, _sum0, vl); output0_tm += packn; } #else // __riscv_vector diff --git a/src/layer/riscv/convolution_winograd_dot_packn.h b/src/layer/riscv/convolution_winograd_dot_packn.h index 1c505d5c2e1..84c38aaecf8 100644 --- a/src/layer/riscv/convolution_winograd_dot_packn.h +++ b/src/layer/riscv/convolution_winograd_dot_packn.h @@ -15,7 +15,7 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); // Mat bottom_blob_tm(tiles, 16/36/64, inch, 4u * packn, packn, opt.workspace_allocator); @@ -67,15 +67,15 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c r0 += bottom_blob_tm.cstep * packn; #else - vfloat32m1_t _val0 = vle32_v_f32m1(r0, vl); - vfloat32m1_t _val1 = vle32_v_f32m1(r0 + packn, vl); - vfloat32m1_t _val2 = vle32_v_f32m1(r0 + packn * 2, vl); - vfloat32m1_t _val3 = vle32_v_f32m1(r0 + packn * 3, vl); - vfloat32m1_t _val4 = vle32_v_f32m1(r0 + packn * 4, vl); - vfloat32m1_t _val5 = vle32_v_f32m1(r0 + packn * 5, vl); - vfloat32m1_t _val6 = vle32_v_f32m1(r0 + packn * 6, vl); - vfloat32m1_t _val7 = vle32_v_f32m1(r0 + packn * 7, vl); - vsseg8e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); + vfloat32m1_t _val0 = __riscv_vle32_v_f32m1(r0, vl); + vfloat32m1_t _val1 = __riscv_vle32_v_f32m1(r0 + packn, vl); + vfloat32m1_t _val2 = __riscv_vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _val3 = __riscv_vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _val4 = __riscv_vle32_v_f32m1(r0 + packn * 4, vl); + vfloat32m1_t _val5 = __riscv_vle32_v_f32m1(r0 + packn * 5, vl); + vfloat32m1_t _val6 = __riscv_vle32_v_f32m1(r0 + packn * 6, vl); + vfloat32m1_t _val7 = 
__riscv_vle32_v_f32m1(r0 + packn * 7, vl); + __riscv_vsseg8e32_v_f32m1x8(tmpptr, __riscv_vcreate_v_f32m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 8; @@ -104,11 +104,11 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c r0 += bottom_blob_tm.cstep * packn; #else - vfloat32m1_t _val0 = vle32_v_f32m1(r0, vl); - vfloat32m1_t _val1 = vle32_v_f32m1(r0 + packn, vl); - vfloat32m1_t _val2 = vle32_v_f32m1(r0 + packn * 2, vl); - vfloat32m1_t _val3 = vle32_v_f32m1(r0 + packn * 3, vl); - vsseg4e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, vl); + vfloat32m1_t _val0 = __riscv_vle32_v_f32m1(r0, vl); + vfloat32m1_t _val1 = __riscv_vle32_v_f32m1(r0 + packn, vl); + vfloat32m1_t _val2 = __riscv_vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _val3 = __riscv_vle32_v_f32m1(r0 + packn * 3, vl); + __riscv_vsseg4e32_v_f32m1x4(tmpptr, __riscv_vcreate_v_f32m1x4(_val0, _val1, _val2, _val3), vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 4; @@ -135,9 +135,9 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c r0 += bottom_blob_tm.cstep * packn; #else - vfloat32m1_t _val0 = vle32_v_f32m1(r0, vl); - vfloat32m1_t _val1 = vle32_v_f32m1(r0 + packn, vl); - vsseg2e32_v_f32m1(tmpptr, _val0, _val1, vl); + vfloat32m1_t _val0 = __riscv_vle32_v_f32m1(r0, vl); + vfloat32m1_t _val1 = __riscv_vle32_v_f32m1(r0 + packn, vl); + __riscv_vsseg2e32_v_f32m1x2(tmpptr, __riscv_vcreate_v_f32m1x2(_val0, _val1), vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 2; @@ -154,8 +154,8 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c for (int q = 0; q < inch; q++) { - vfloat32m1_t _val = vle32_v_f32m1(r0, vl); - vse32_v_f32m1(tmpptr, _val, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(r0, vl); + __riscv_vse32_v_f32m1(tmpptr, _val, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn; @@ -187,14 +187,14 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c int nn = inch * packn; // inch always > 0 - vfloat32m1_t _sum0 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum1 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum2 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum3 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum4 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum5 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum6 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum7 = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum0 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum1 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum2 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum3 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum4 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum5 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum6 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum7 = __riscv_vfmv_v_f_f32m1(0.f, vl); for (int j = 0; j < nn; j++) { @@ -206,27 +206,27 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c float val5 = *r0++; float val6 = *r0++; float val7 = *r0++; - vfloat32m1_t _w0 = vle32_v_f32m1(k0, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, val0, _w0, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, val1, _w0, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, val2, _w0, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, val3, _w0, vl); - _sum4 = vfmacc_vf_f32m1(_sum4, val4, _w0, vl); - _sum5 = vfmacc_vf_f32m1(_sum5, val5, _w0, vl); - _sum6 = vfmacc_vf_f32m1(_sum6, val6, _w0, vl); - _sum7 = vfmacc_vf_f32m1(_sum7, 
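// [editorial sketch, not part of the patch] The repacking hunks above show the mirrored change on
// the store side: the legacy vsseg8e32_v_f32m1 took eight separate vector operands, while the v1.0
// __riscv_vsseg8e32_v_f32m1x8 takes a single vfloat32m1x8_t, assembled with
// __riscv_vcreate_v_f32m1x8. Standalone illustration with the four-field variant (names invented):
#include <riscv_vector.h>

static inline void interleave4_example(const float* in, float* out, size_t vl)
{
    vfloat32m1_t a = __riscv_vle32_v_f32m1(in, vl);
    vfloat32m1_t b = __riscv_vle32_v_f32m1(in + vl, vl);
    vfloat32m1_t c = __riscv_vle32_v_f32m1(in + vl * 2, vl);
    vfloat32m1_t d = __riscv_vle32_v_f32m1(in + vl * 3, vl);
    // stores a[0], b[0], c[0], d[0], a[1], b[1], ... i.e. interleaves four rows in one instruction
    __riscv_vsseg4e32_v_f32m1x4(out, __riscv_vcreate_v_f32m1x4(a, b, c, d), vl);
}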
val7, _w0, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(k0, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, val0, _w0, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, val1, _w0, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, val2, _w0, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, val3, _w0, vl); + _sum4 = __riscv_vfmacc_vf_f32m1(_sum4, val4, _w0, vl); + _sum5 = __riscv_vfmacc_vf_f32m1(_sum5, val5, _w0, vl); + _sum6 = __riscv_vfmacc_vf_f32m1(_sum6, val6, _w0, vl); + _sum7 = __riscv_vfmacc_vf_f32m1(_sum7, val7, _w0, vl); k0 += packn; } - vse32_v_f32m1(output0_tm, _sum0, vl); - vse32_v_f32m1(output0_tm + packn, _sum1, vl); - vse32_v_f32m1(output0_tm + packn * 2, _sum2, vl); - vse32_v_f32m1(output0_tm + packn * 3, _sum3, vl); - vse32_v_f32m1(output0_tm + packn * 4, _sum4, vl); - vse32_v_f32m1(output0_tm + packn * 5, _sum5, vl); - vse32_v_f32m1(output0_tm + packn * 6, _sum6, vl); - vse32_v_f32m1(output0_tm + packn * 7, _sum7, vl); + __riscv_vse32_v_f32m1(output0_tm, _sum0, vl); + __riscv_vse32_v_f32m1(output0_tm + packn, _sum1, vl); + __riscv_vse32_v_f32m1(output0_tm + packn * 2, _sum2, vl); + __riscv_vse32_v_f32m1(output0_tm + packn * 3, _sum3, vl); + __riscv_vse32_v_f32m1(output0_tm + packn * 4, _sum4, vl); + __riscv_vse32_v_f32m1(output0_tm + packn * 5, _sum5, vl); + __riscv_vse32_v_f32m1(output0_tm + packn * 6, _sum6, vl); + __riscv_vse32_v_f32m1(output0_tm + packn * 7, _sum7, vl); output0_tm += packn * 8; } @@ -237,10 +237,10 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c int nn = inch * packn; // inch always > 0 - vfloat32m1_t _sum0 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum1 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum2 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum3 = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum0 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum1 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum2 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum3 = __riscv_vfmv_v_f_f32m1(0.f, vl); for (int j = 0; j < nn; j++) { @@ -248,19 +248,19 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c float val1 = *r0++; float val2 = *r0++; float val3 = *r0++; - vfloat32m1_t _w0 = vle32_v_f32m1(k0, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, val0, _w0, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, val1, _w0, vl); - _sum2 = vfmacc_vf_f32m1(_sum2, val2, _w0, vl); - _sum3 = vfmacc_vf_f32m1(_sum3, val3, _w0, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(k0, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, val0, _w0, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, val1, _w0, vl); + _sum2 = __riscv_vfmacc_vf_f32m1(_sum2, val2, _w0, vl); + _sum3 = __riscv_vfmacc_vf_f32m1(_sum3, val3, _w0, vl); k0 += packn; } - vse32_v_f32m1(output0_tm, _sum0, vl); - vse32_v_f32m1(output0_tm + packn, _sum1, vl); - vse32_v_f32m1(output0_tm + packn * 2, _sum2, vl); - vse32_v_f32m1(output0_tm + packn * 3, _sum3, vl); + __riscv_vse32_v_f32m1(output0_tm, _sum0, vl); + __riscv_vse32_v_f32m1(output0_tm + packn, _sum1, vl); + __riscv_vse32_v_f32m1(output0_tm + packn * 2, _sum2, vl); + __riscv_vse32_v_f32m1(output0_tm + packn * 3, _sum3, vl); output0_tm += packn * 4; } @@ -271,22 +271,22 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c int nn = inch * packn; // inch always > 0 - vfloat32m1_t _sum0 = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sum1 = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum0 = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum1 = __riscv_vfmv_v_f_f32m1(0.f, vl); for (int j = 0; j < nn; j++) { float 
val0 = *r0++; float val1 = *r0++; - vfloat32m1_t _w0 = vle32_v_f32m1(k0, vl); - _sum0 = vfmacc_vf_f32m1(_sum0, val0, _w0, vl); - _sum1 = vfmacc_vf_f32m1(_sum1, val1, _w0, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(k0, vl); + _sum0 = __riscv_vfmacc_vf_f32m1(_sum0, val0, _w0, vl); + _sum1 = __riscv_vfmacc_vf_f32m1(_sum1, val1, _w0, vl); k0 += packn; } - vse32_v_f32m1(output0_tm, _sum0, vl); - vse32_v_f32m1(output0_tm + packn, _sum1, vl); + __riscv_vse32_v_f32m1(output0_tm, _sum0, vl); + __riscv_vse32_v_f32m1(output0_tm + packn, _sum1, vl); output0_tm += packn * 2; } @@ -297,18 +297,18 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c int nn = inch * packn; // inch always > 0 - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); for (int j = 0; j < nn; j++) { float val = *r0++; - vfloat32m1_t _w0 = vle32_v_f32m1(k0, vl); - _sum = vfmacc_vf_f32m1(_sum, val, _w0, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(k0, vl); + _sum = __riscv_vfmacc_vf_f32m1(_sum, val, _w0, vl); k0 += packn; } - vse32_v_f32m1(output0_tm, _sum, vl); + __riscv_vse32_v_f32m1(output0_tm, _sum, vl); output0_tm += packn; } diff --git a/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h b/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h index ed35ad3e378..dd3e0b9afc7 100644 --- a/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h +++ b/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h @@ -15,7 +15,7 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); // Mat bottom_blob_tm(tiles, 16/36/64, inch, 2u * packn, packn, opt.workspace_allocator); @@ -67,15 +67,15 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o r0 += bottom_blob_tm.cstep * packn; #else - vfloat16m1_t _val0 = vle16_v_f16m1(r0, vl); - vfloat16m1_t _val1 = vle16_v_f16m1(r0 + packn, vl); - vfloat16m1_t _val2 = vle16_v_f16m1(r0 + packn * 2, vl); - vfloat16m1_t _val3 = vle16_v_f16m1(r0 + packn * 3, vl); - vfloat16m1_t _val4 = vle16_v_f16m1(r0 + packn * 4, vl); - vfloat16m1_t _val5 = vle16_v_f16m1(r0 + packn * 5, vl); - vfloat16m1_t _val6 = vle16_v_f16m1(r0 + packn * 6, vl); - vfloat16m1_t _val7 = vle16_v_f16m1(r0 + packn * 7, vl); - vsseg8e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); + vfloat16m1_t _val0 = __riscv_vle16_v_f16m1(r0, vl); + vfloat16m1_t _val1 = __riscv_vle16_v_f16m1(r0 + packn, vl); + vfloat16m1_t _val2 = __riscv_vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _val3 = __riscv_vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _val4 = __riscv_vle16_v_f16m1(r0 + packn * 4, vl); + vfloat16m1_t _val5 = __riscv_vle16_v_f16m1(r0 + packn * 5, vl); + vfloat16m1_t _val6 = __riscv_vle16_v_f16m1(r0 + packn * 6, vl); + vfloat16m1_t _val7 = __riscv_vle16_v_f16m1(r0 + packn * 7, vl); + __riscv_vsseg8e16_v_f16m1x8(tmpptr, __riscv_vcreate_v_f16m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 8; @@ -104,11 +104,11 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o r0 += bottom_blob_tm.cstep * packn; #else - vfloat16m1_t _val0 = vle16_v_f16m1(r0, vl); - vfloat16m1_t _val1 = vle16_v_f16m1(r0 + packn, vl); - vfloat16m1_t _val2 = vle16_v_f16m1(r0 + packn * 2, vl); - vfloat16m1_t 
_val3 = vle16_v_f16m1(r0 + packn * 3, vl); - vsseg4e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, vl); + vfloat16m1_t _val0 = __riscv_vle16_v_f16m1(r0, vl); + vfloat16m1_t _val1 = __riscv_vle16_v_f16m1(r0 + packn, vl); + vfloat16m1_t _val2 = __riscv_vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _val3 = __riscv_vle16_v_f16m1(r0 + packn * 3, vl); + __riscv_vsseg4e16_v_f16m1x4(tmpptr, __riscv_vcreate_v_f16m1x4(_val0, _val1, _val2, _val3), vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 4; @@ -135,9 +135,9 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o r0 += bottom_blob_tm.cstep * packn; #else - vfloat16m1_t _val0 = vle16_v_f16m1(r0, vl); - vfloat16m1_t _val1 = vle16_v_f16m1(r0 + packn, vl); - vsseg2e16_v_f16m1(tmpptr, _val0, _val1, vl); + vfloat16m1_t _val0 = __riscv_vle16_v_f16m1(r0, vl); + vfloat16m1_t _val1 = __riscv_vle16_v_f16m1(r0 + packn, vl); + __riscv_vsseg2e16_v_f16m1x2(tmpptr, __riscv_vcreate_v_f16m1x2(_val0, _val1), vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 2; @@ -154,8 +154,8 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o for (int q = 0; q < inch; q++) { - vfloat16m1_t _val = vle16_v_f16m1(r0, vl); - vse16_v_f16m1(tmpptr, _val, vl); + vfloat16m1_t _val = __riscv_vle16_v_f16m1(r0, vl); + __riscv_vse16_v_f16m1(tmpptr, _val, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn; @@ -187,14 +187,14 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o int nn = inch * packn; // inch always > 0 - vfloat16m1_t _sum0 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum1 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum2 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum3 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum4 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum5 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum6 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum7 = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum4 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum5 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum6 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum7 = __riscv_vfmv_v_f_f16m1(0.f, vl); for (int j = 0; j < nn; j++) { @@ -206,27 +206,27 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o __fp16 val5 = *r0++; __fp16 val6 = *r0++; __fp16 val7 = *r0++; - vfloat16m1_t _w0 = vle16_v_f16m1(k0, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, val0, _w0, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, val1, _w0, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, val2, _w0, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, val3, _w0, vl); - _sum4 = vfmacc_vf_f16m1(_sum4, val4, _w0, vl); - _sum5 = vfmacc_vf_f16m1(_sum5, val5, _w0, vl); - _sum6 = vfmacc_vf_f16m1(_sum6, val6, _w0, vl); - _sum7 = vfmacc_vf_f16m1(_sum7, val7, _w0, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(k0, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, val0, _w0, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, val1, _w0, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, val2, _w0, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, val3, _w0, vl); + _sum4 = __riscv_vfmacc_vf_f16m1(_sum4, val4, _w0, vl); + _sum5 = __riscv_vfmacc_vf_f16m1(_sum5, val5, _w0, vl); + _sum6 = __riscv_vfmacc_vf_f16m1(_sum6, val6, _w0, vl); + _sum7 = __riscv_vfmacc_vf_f16m1(_sum7, val7, _w0, vl); k0 += packn; } - 
vse16_v_f16m1(output0_tm, _sum0, vl); - vse16_v_f16m1(output0_tm + packn, _sum1, vl); - vse16_v_f16m1(output0_tm + packn * 2, _sum2, vl); - vse16_v_f16m1(output0_tm + packn * 3, _sum3, vl); - vse16_v_f16m1(output0_tm + packn * 4, _sum4, vl); - vse16_v_f16m1(output0_tm + packn * 5, _sum5, vl); - vse16_v_f16m1(output0_tm + packn * 6, _sum6, vl); - vse16_v_f16m1(output0_tm + packn * 7, _sum7, vl); + __riscv_vse16_v_f16m1(output0_tm, _sum0, vl); + __riscv_vse16_v_f16m1(output0_tm + packn, _sum1, vl); + __riscv_vse16_v_f16m1(output0_tm + packn * 2, _sum2, vl); + __riscv_vse16_v_f16m1(output0_tm + packn * 3, _sum3, vl); + __riscv_vse16_v_f16m1(output0_tm + packn * 4, _sum4, vl); + __riscv_vse16_v_f16m1(output0_tm + packn * 5, _sum5, vl); + __riscv_vse16_v_f16m1(output0_tm + packn * 6, _sum6, vl); + __riscv_vse16_v_f16m1(output0_tm + packn * 7, _sum7, vl); output0_tm += packn * 8; } @@ -237,10 +237,10 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o int nn = inch * packn; // inch always > 0 - vfloat16m1_t _sum0 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum1 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum2 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum3 = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum2 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum3 = __riscv_vfmv_v_f_f16m1(0.f, vl); for (int j = 0; j < nn; j++) { @@ -248,19 +248,19 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o __fp16 val1 = *r0++; __fp16 val2 = *r0++; __fp16 val3 = *r0++; - vfloat16m1_t _w0 = vle16_v_f16m1(k0, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, val0, _w0, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, val1, _w0, vl); - _sum2 = vfmacc_vf_f16m1(_sum2, val2, _w0, vl); - _sum3 = vfmacc_vf_f16m1(_sum3, val3, _w0, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(k0, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, val0, _w0, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, val1, _w0, vl); + _sum2 = __riscv_vfmacc_vf_f16m1(_sum2, val2, _w0, vl); + _sum3 = __riscv_vfmacc_vf_f16m1(_sum3, val3, _w0, vl); k0 += packn; } - vse16_v_f16m1(output0_tm, _sum0, vl); - vse16_v_f16m1(output0_tm + packn, _sum1, vl); - vse16_v_f16m1(output0_tm + packn * 2, _sum2, vl); - vse16_v_f16m1(output0_tm + packn * 3, _sum3, vl); + __riscv_vse16_v_f16m1(output0_tm, _sum0, vl); + __riscv_vse16_v_f16m1(output0_tm + packn, _sum1, vl); + __riscv_vse16_v_f16m1(output0_tm + packn * 2, _sum2, vl); + __riscv_vse16_v_f16m1(output0_tm + packn * 3, _sum3, vl); output0_tm += packn * 4; } @@ -271,22 +271,22 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o int nn = inch * packn; // inch always > 0 - vfloat16m1_t _sum0 = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sum1 = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum0 = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum1 = __riscv_vfmv_v_f_f16m1(0.f, vl); for (int j = 0; j < nn; j++) { __fp16 val0 = *r0++; __fp16 val1 = *r0++; - vfloat16m1_t _w0 = vle16_v_f16m1(k0, vl); - _sum0 = vfmacc_vf_f16m1(_sum0, val0, _w0, vl); - _sum1 = vfmacc_vf_f16m1(_sum1, val1, _w0, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(k0, vl); + _sum0 = __riscv_vfmacc_vf_f16m1(_sum0, val0, _w0, vl); + _sum1 = __riscv_vfmacc_vf_f16m1(_sum1, val1, _w0, vl); k0 += packn; } - vse16_v_f16m1(output0_tm, _sum0, vl); - vse16_v_f16m1(output0_tm + packn, _sum1, vl); + __riscv_vse16_v_f16m1(output0_tm, _sum0, vl); + __riscv_vse16_v_f16m1(output0_tm + packn, 
_sum1, vl); output0_tm += packn * 2; } @@ -297,18 +297,18 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o int nn = inch * packn; // inch always > 0 - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); for (int j = 0; j < nn; j++) { __fp16 val = *r0++; - vfloat16m1_t _w0 = vle16_v_f16m1(k0, vl); - _sum = vfmacc_vf_f16m1(_sum, val, _w0, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(k0, vl); + _sum = __riscv_vfmacc_vf_f16m1(_sum, val, _w0, vl); k0 += packn; } - vse16_v_f16m1(output0_tm, _sum, vl); + __riscv_vse16_v_f16m1(output0_tm, _sum, vl); output0_tm += packn; } diff --git a/src/layer/riscv/convolution_winograd_transform_packn.h b/src/layer/riscv/convolution_winograd_transform_packn.h index 874796af9d4..9c97d184f48 100644 --- a/src/layer/riscv/convolution_winograd_transform_packn.h +++ b/src/layer/riscv/convolution_winograd_transform_packn.h @@ -15,7 +15,7 @@ static void conv3x3s1_winograd63_transform_input_packn_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -72,43 +72,43 @@ static void conv3x3s1_winograd63_transform_input_packn_rvv(const Mat& bottom_blo for (int m = 0; m < 8; m++) { - vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); - vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn, vl); - vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); - vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); - vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); - vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); - vfloat32m1_t _r06 = vle32_v_f32m1(r0 + packn * 6, vl); - vfloat32m1_t _r07 = vle32_v_f32m1(r0 + packn * 7, vl); - - vfloat32m1_t _tmp0m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r00, _r06, vl), 5.25f, vfsub_vv_f32m1(_r04, _r02, vl), vl); - vfloat32m1_t _tmp7m = vfmacc_vf_f32m1(vfsub_vv_f32m1(_r07, _r01, vl), 5.25f, vfsub_vv_f32m1(_r03, _r05, vl), vl); - vse32_v_f32m1(tmp[0][m], _tmp0m, vl); - vse32_v_f32m1(tmp[7][m], _tmp7m, vl); - - vfloat32m1_t _tmp12a = vfmacc_vf_f32m1(vfadd_vv_f32m1(_r02, _r06, vl), -4.25f, _r04, vl); - vfloat32m1_t _tmp12b = vfmacc_vf_f32m1(vfadd_vv_f32m1(_r01, _r05, vl), -4.25f, _r03, vl); - - vfloat32m1_t _tmp1m = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); - vfloat32m1_t _tmp2m = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); - vse32_v_f32m1(tmp[1][m], _tmp1m, vl); - vse32_v_f32m1(tmp[2][m], _tmp2m, vl); - - vfloat32m1_t _tmp34a = vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); - vfloat32m1_t _tmp34b = vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05, vl); - - vfloat32m1_t _tmp3m = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); - vfloat32m1_t _tmp4m = vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); - vse32_v_f32m1(tmp[3][m], _tmp3m, vl); - vse32_v_f32m1(tmp[4][m], _tmp4m, vl); - - vfloat32m1_t _tmp56a = vfmacc_vf_f32m1(_r06, 4.f, vfmacc_vf_f32m1(_r02, -1.25f, _r04, vl), vl); - vfloat32m1_t _tmp56b = vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, vl); - - vfloat32m1_t _tmp5m = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); - vfloat32m1_t _tmp6m = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); - vse32_v_f32m1(tmp[5][m], _tmp5m, vl); - vse32_v_f32m1(tmp[6][m], _tmp6m, vl); + vfloat32m1_t _r00 = __riscv_vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = __riscv_vle32_v_f32m1(r0 + packn, vl); + vfloat32m1_t _r02 = 
__riscv_vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = __riscv_vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = __riscv_vle32_v_f32m1(r0 + packn * 4, vl); + vfloat32m1_t _r05 = __riscv_vle32_v_f32m1(r0 + packn * 5, vl); + vfloat32m1_t _r06 = __riscv_vle32_v_f32m1(r0 + packn * 6, vl); + vfloat32m1_t _r07 = __riscv_vle32_v_f32m1(r0 + packn * 7, vl); + + vfloat32m1_t _tmp0m = __riscv_vfmacc_vf_f32m1(__riscv_vfsub_vv_f32m1(_r00, _r06, vl), 5.25f, __riscv_vfsub_vv_f32m1(_r04, _r02, vl), vl); + vfloat32m1_t _tmp7m = __riscv_vfmacc_vf_f32m1(__riscv_vfsub_vv_f32m1(_r07, _r01, vl), 5.25f, __riscv_vfsub_vv_f32m1(_r03, _r05, vl), vl); + __riscv_vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + __riscv_vse32_v_f32m1(tmp[7][m], _tmp7m, vl); + + vfloat32m1_t _tmp12a = __riscv_vfmacc_vf_f32m1(__riscv_vfadd_vv_f32m1(_r02, _r06, vl), -4.25f, _r04, vl); + vfloat32m1_t _tmp12b = __riscv_vfmacc_vf_f32m1(__riscv_vfadd_vv_f32m1(_r01, _r05, vl), -4.25f, _r03, vl); + + vfloat32m1_t _tmp1m = __riscv_vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); + vfloat32m1_t _tmp2m = __riscv_vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); + __riscv_vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + __riscv_vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + + vfloat32m1_t _tmp34a = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); + vfloat32m1_t _tmp34b = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05, vl); + + vfloat32m1_t _tmp3m = __riscv_vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); + vfloat32m1_t _tmp4m = __riscv_vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); + __riscv_vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + __riscv_vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + + vfloat32m1_t _tmp56a = __riscv_vfmacc_vf_f32m1(_r06, 4.f, __riscv_vfmacc_vf_f32m1(_r02, -1.25f, _r04, vl), vl); + vfloat32m1_t _tmp56b = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, vl); + + vfloat32m1_t _tmp5m = __riscv_vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); + vfloat32m1_t _tmp6m = __riscv_vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); + __riscv_vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + __riscv_vse32_v_f32m1(tmp[6][m], _tmp6m, vl); r0 += w * packn; } @@ -124,44 +124,44 @@ static void conv3x3s1_winograd63_transform_input_packn_rvv(const Mat& bottom_blo for (int m = 0; m < 8; m++) { - vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); - vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); - vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); - vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); - vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); - vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); - vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); - vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); + vfloat32m1_t _tmp00 = __riscv_vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = __riscv_vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = __riscv_vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = __riscv_vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = __riscv_vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = __riscv_vle32_v_f32m1(tmp[m][5], vl); + vfloat32m1_t _tmp06 = __riscv_vle32_v_f32m1(tmp[m][6], vl); + vfloat32m1_t _tmp07 = __riscv_vle32_v_f32m1(tmp[m][7], vl); - vfloat32m1_t _r0tm0 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp00, _tmp06, vl), 5.25f, vfsub_vv_f32m1(_tmp04, _tmp02, vl), vl); - vfloat32m1_t _r0tm7 = vfmacc_vf_f32m1(vfsub_vv_f32m1(_tmp07, _tmp01, vl), 5.25f, vfsub_vv_f32m1(_tmp03, _tmp05, vl), vl); + vfloat32m1_t 
_r0tm0 = __riscv_vfmacc_vf_f32m1(__riscv_vfsub_vv_f32m1(_tmp00, _tmp06, vl), 5.25f, __riscv_vfsub_vv_f32m1(_tmp04, _tmp02, vl), vl); + vfloat32m1_t _r0tm7 = __riscv_vfmacc_vf_f32m1(__riscv_vfsub_vv_f32m1(_tmp07, _tmp01, vl), 5.25f, __riscv_vfsub_vv_f32m1(_tmp03, _tmp05, vl), vl); - vfloat32m1_t _tmp12a = vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); - vfloat32m1_t _tmp12b = vfmacc_vf_f32m1(vfadd_vv_f32m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); + vfloat32m1_t _tmp12a = __riscv_vfmacc_vf_f32m1(__riscv_vfadd_vv_f32m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); + vfloat32m1_t _tmp12b = __riscv_vfmacc_vf_f32m1(__riscv_vfadd_vv_f32m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); - vfloat32m1_t _r0tm1 = vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); - vfloat32m1_t _r0tm2 = vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); + vfloat32m1_t _r0tm1 = __riscv_vfadd_vv_f32m1(_tmp12a, _tmp12b, vl); + vfloat32m1_t _r0tm2 = __riscv_vfsub_vv_f32m1(_tmp12a, _tmp12b, vl); - vfloat32m1_t _tmp34a = vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); - vfloat32m1_t _tmp34b = vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, _tmp05, vl); + vfloat32m1_t _tmp34a = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); + vfloat32m1_t _tmp34b = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, _tmp05, vl); - vfloat32m1_t _r0tm3 = vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); - vfloat32m1_t _r0tm4 = vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); + vfloat32m1_t _r0tm3 = __riscv_vfadd_vv_f32m1(_tmp34a, _tmp34b, vl); + vfloat32m1_t _r0tm4 = __riscv_vfsub_vv_f32m1(_tmp34a, _tmp34b, vl); - vfloat32m1_t _tmp56a = vfmacc_vf_f32m1(_tmp06, 4.f, vfmacc_vf_f32m1(_tmp02, -1.25f, _tmp04, vl), vl); - vfloat32m1_t _tmp56b = vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, _tmp05, vl); + vfloat32m1_t _tmp56a = __riscv_vfmacc_vf_f32m1(_tmp06, 4.f, __riscv_vfmacc_vf_f32m1(_tmp02, -1.25f, _tmp04, vl), vl); + vfloat32m1_t _tmp56b = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, _tmp05, vl); - vfloat32m1_t _r0tm5 = vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); - vfloat32m1_t _r0tm6 = vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); + vfloat32m1_t _r0tm5 = __riscv_vfadd_vv_f32m1(_tmp56a, _tmp56b, vl); + vfloat32m1_t _r0tm6 = __riscv_vfsub_vv_f32m1(_tmp56a, _tmp56b, vl); - vse32_v_f32m1(r0_tm_0, _r0tm0, vl); - vse32_v_f32m1(r0_tm_1, _r0tm1, vl); - vse32_v_f32m1(r0_tm_2, _r0tm2, vl); - vse32_v_f32m1(r0_tm_3, _r0tm3, vl); - vse32_v_f32m1(r0_tm_4, _r0tm4, vl); - vse32_v_f32m1(r0_tm_5, _r0tm5, vl); - vse32_v_f32m1(r0_tm_6, _r0tm6, vl); - vse32_v_f32m1(r0_tm_7, _r0tm7, vl); + __riscv_vse32_v_f32m1(r0_tm_0, _r0tm0, vl); + __riscv_vse32_v_f32m1(r0_tm_1, _r0tm1, vl); + __riscv_vse32_v_f32m1(r0_tm_2, _r0tm2, vl); + __riscv_vse32_v_f32m1(r0_tm_3, _r0tm3, vl); + __riscv_vse32_v_f32m1(r0_tm_4, _r0tm4, vl); + __riscv_vse32_v_f32m1(r0_tm_5, _r0tm5, vl); + __riscv_vse32_v_f32m1(r0_tm_6, _r0tm6, vl); + __riscv_vse32_v_f32m1(r0_tm_7, _r0tm7, vl); r0_tm_0 += tiles * packn * 8; r0_tm_1 += tiles * packn * 8; @@ -180,7 +180,7 @@ static void conv3x3s1_winograd63_transform_input_packn_rvv(const Mat& bottom_blo static void conv3x3s1_winograd63_transform_output_packn_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = 
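// [editorial sketch, not part of the patch] In the transform hunks above the Winograd B^T d B
// arithmetic is expressed by nesting the renamed fused intrinsics; read inside-out,
// __riscv_vfmacc_vf_f32m1(acc, s, v, vl) computes acc + s * v elementwise. For example the first
// transform row, tmp0 = (r00 - r06) + 5.25 * (r04 - r02), is one such nesting
// (the helper below is invented for the illustration):
#include <riscv_vector.h>

static inline vfloat32m1_t winograd63_row0_example(vfloat32m1_t r00, vfloat32m1_t r02,
                                                   vfloat32m1_t r04, vfloat32m1_t r06, size_t vl)
{
    // (r00 - r06) + 5.25f * (r04 - r02)
    return __riscv_vfmacc_vf_f32m1(__riscv_vfsub_vv_f32m1(r00, r06, vl), 5.25f,
                                   __riscv_vfsub_vv_f32m1(r04, r02, vl), vl);
}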
vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -214,7 +214,7 @@ static void conv3x3s1_winograd63_transform_output_packn_rvv(const Mat& top_blob_ const Mat out0_tm = top_blob_tm.channel(p); Mat out0 = top_blob.channel(p); - vfloat32m1_t _bias0 = biasptr ? vle32_v_f32m1(biasptr + p * packn, vl) : vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _bias0 = biasptr ? __riscv_vle32_v_f32m1(biasptr + p * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); // NOTE c99 variable length array float tmp[6][8][packn]; @@ -237,37 +237,37 @@ static void conv3x3s1_winograd63_transform_output_packn_rvv(const Mat& top_blob_ for (int m = 0; m < 8; m++) { - vfloat32m1_t _out0tm0 = vle32_v_f32m1(output0_tm_0, vl); - vfloat32m1_t _out0tm1 = vle32_v_f32m1(output0_tm_1, vl); - vfloat32m1_t _out0tm2 = vle32_v_f32m1(output0_tm_2, vl); - vfloat32m1_t _out0tm3 = vle32_v_f32m1(output0_tm_3, vl); - vfloat32m1_t _out0tm4 = vle32_v_f32m1(output0_tm_4, vl); - vfloat32m1_t _out0tm5 = vle32_v_f32m1(output0_tm_5, vl); - vfloat32m1_t _out0tm6 = vle32_v_f32m1(output0_tm_6, vl); - vfloat32m1_t _out0tm7 = vle32_v_f32m1(output0_tm_7, vl); - - vfloat32m1_t _tmp024a = vfadd_vv_f32m1(_out0tm1, _out0tm2, vl); - vfloat32m1_t _tmp135a = vfsub_vv_f32m1(_out0tm1, _out0tm2, vl); - - vfloat32m1_t _tmp024b = vfadd_vv_f32m1(_out0tm3, _out0tm4, vl); - vfloat32m1_t _tmp135b = vfsub_vv_f32m1(_out0tm3, _out0tm4, vl); - - vfloat32m1_t _tmp024c = vfadd_vv_f32m1(_out0tm5, _out0tm6, vl); - vfloat32m1_t _tmp135c = vfsub_vv_f32m1(_out0tm5, _out0tm6, vl); - - vfloat32m1_t _tmp0m = vfadd_vv_f32m1(vfadd_vv_f32m1(_out0tm0, _tmp024a, vl), vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl); - vfloat32m1_t _tmp2m = vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); - vfloat32m1_t _tmp4m = vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); - vse32_v_f32m1(tmp[0][m], _tmp0m, vl); - vse32_v_f32m1(tmp[2][m], _tmp2m, vl); - vse32_v_f32m1(tmp[4][m], _tmp4m, vl); - - vfloat32m1_t _tmp1m = vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); - vfloat32m1_t _tmp3m = vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); - vfloat32m1_t _tmp5m = vfadd_vv_f32m1(vfadd_vv_f32m1(_out0tm7, _tmp135a, vl), vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl); - vse32_v_f32m1(tmp[1][m], _tmp1m, vl); - vse32_v_f32m1(tmp[3][m], _tmp3m, vl); - vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + vfloat32m1_t _out0tm0 = __riscv_vle32_v_f32m1(output0_tm_0, vl); + vfloat32m1_t _out0tm1 = __riscv_vle32_v_f32m1(output0_tm_1, vl); + vfloat32m1_t _out0tm2 = __riscv_vle32_v_f32m1(output0_tm_2, vl); + vfloat32m1_t _out0tm3 = __riscv_vle32_v_f32m1(output0_tm_3, vl); + vfloat32m1_t _out0tm4 = __riscv_vle32_v_f32m1(output0_tm_4, vl); + vfloat32m1_t _out0tm5 = __riscv_vle32_v_f32m1(output0_tm_5, vl); + vfloat32m1_t _out0tm6 = __riscv_vle32_v_f32m1(output0_tm_6, vl); + vfloat32m1_t _out0tm7 = __riscv_vle32_v_f32m1(output0_tm_7, vl); + + vfloat32m1_t _tmp024a = __riscv_vfadd_vv_f32m1(_out0tm1, _out0tm2, vl); + vfloat32m1_t _tmp135a = __riscv_vfsub_vv_f32m1(_out0tm1, _out0tm2, vl); + + vfloat32m1_t _tmp024b = __riscv_vfadd_vv_f32m1(_out0tm3, _out0tm4, vl); + vfloat32m1_t _tmp135b = __riscv_vfsub_vv_f32m1(_out0tm3, _out0tm4, vl); + + vfloat32m1_t _tmp024c = __riscv_vfadd_vv_f32m1(_out0tm5, _out0tm6, vl); + vfloat32m1_t _tmp135c = __riscv_vfsub_vv_f32m1(_out0tm5, _out0tm6, vl); + + vfloat32m1_t _tmp0m = 
__riscv_vfadd_vv_f32m1(__riscv_vfadd_vv_f32m1(_out0tm0, _tmp024a, vl), __riscv_vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat32m1_t _tmp2m = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat32m1_t _tmp4m = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + __riscv_vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + __riscv_vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + __riscv_vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + + vfloat32m1_t _tmp1m = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat32m1_t _tmp3m = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat32m1_t _tmp5m = __riscv_vfadd_vv_f32m1(__riscv_vfadd_vv_f32m1(_out0tm7, _tmp135a, vl), __riscv_vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl); + __riscv_vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + __riscv_vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + __riscv_vse32_v_f32m1(tmp[5][m], _tmp5m, vl); output0_tm_0 += tiles * packn * 8; output0_tm_1 += tiles * packn * 8; @@ -281,37 +281,37 @@ static void conv3x3s1_winograd63_transform_output_packn_rvv(const Mat& top_blob_ for (int m = 0; m < 6; m++) { - vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); - vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); - vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); - vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); - vfloat32m1_t _tmp04 = vle32_v_f32m1(tmp[m][4], vl); - vfloat32m1_t _tmp05 = vle32_v_f32m1(tmp[m][5], vl); - vfloat32m1_t _tmp06 = vle32_v_f32m1(tmp[m][6], vl); - vfloat32m1_t _tmp07 = vle32_v_f32m1(tmp[m][7], vl); - - vfloat32m1_t _tmp024a = vfadd_vv_f32m1(_tmp01, _tmp02, vl); - vfloat32m1_t _tmp135a = vfsub_vv_f32m1(_tmp01, _tmp02, vl); - - vfloat32m1_t _tmp024b = vfadd_vv_f32m1(_tmp03, _tmp04, vl); - vfloat32m1_t _tmp135b = vfsub_vv_f32m1(_tmp03, _tmp04, vl); - - vfloat32m1_t _tmp024c = vfadd_vv_f32m1(_tmp05, _tmp06, vl); - vfloat32m1_t _tmp135c = vfsub_vv_f32m1(_tmp05, _tmp06, vl); - - vfloat32m1_t _out00 = vfadd_vv_f32m1(_bias0, vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp00, _tmp024a, vl), vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl), vl); - vfloat32m1_t _out02 = vfadd_vv_f32m1(_bias0, vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl), vl); - vfloat32m1_t _out04 = vfadd_vv_f32m1(_bias0, vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl), vl); - vse32_v_f32m1(output0, _out00, vl); - vse32_v_f32m1(output0 + packn * 2, _out02, vl); - vse32_v_f32m1(output0 + packn * 4, _out04, vl); - - vfloat32m1_t _out01 = vfadd_vv_f32m1(_bias0, vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl), vl); - vfloat32m1_t _out03 = vfadd_vv_f32m1(_bias0, vfmacc_vf_f32m1(vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl), vl); - vfloat32m1_t _out05 = vfadd_vv_f32m1(_bias0, vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp07, _tmp135a, vl), vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl), vl); - vse32_v_f32m1(output0 + packn, _out01, vl); - vse32_v_f32m1(output0 + packn * 3, _out03, vl); - vse32_v_f32m1(output0 + packn * 5, _out05, vl); + vfloat32m1_t _tmp00 = __riscv_vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = __riscv_vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = __riscv_vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = __riscv_vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp04 = __riscv_vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _tmp05 = 
__riscv_vle32_v_f32m1(tmp[m][5], vl); + vfloat32m1_t _tmp06 = __riscv_vle32_v_f32m1(tmp[m][6], vl); + vfloat32m1_t _tmp07 = __riscv_vle32_v_f32m1(tmp[m][7], vl); + + vfloat32m1_t _tmp024a = __riscv_vfadd_vv_f32m1(_tmp01, _tmp02, vl); + vfloat32m1_t _tmp135a = __riscv_vfsub_vv_f32m1(_tmp01, _tmp02, vl); + + vfloat32m1_t _tmp024b = __riscv_vfadd_vv_f32m1(_tmp03, _tmp04, vl); + vfloat32m1_t _tmp135b = __riscv_vfsub_vv_f32m1(_tmp03, _tmp04, vl); + + vfloat32m1_t _tmp024c = __riscv_vfadd_vv_f32m1(_tmp05, _tmp06, vl); + vfloat32m1_t _tmp135c = __riscv_vfsub_vv_f32m1(_tmp05, _tmp06, vl); + + vfloat32m1_t _out00 = __riscv_vfadd_vv_f32m1(_bias0, __riscv_vfadd_vv_f32m1(__riscv_vfadd_vv_f32m1(_tmp00, _tmp024a, vl), __riscv_vfmacc_vf_f32m1(_tmp024b, 32.f, _tmp024c, vl), vl), vl); + vfloat32m1_t _out02 = __riscv_vfadd_vv_f32m1(_bias0, __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl), vl); + vfloat32m1_t _out04 = __riscv_vfadd_vv_f32m1(_bias0, __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl), vl); + __riscv_vse32_v_f32m1(output0, _out00, vl); + __riscv_vse32_v_f32m1(output0 + packn * 2, _out02, vl); + __riscv_vse32_v_f32m1(output0 + packn * 4, _out04, vl); + + vfloat32m1_t _out01 = __riscv_vfadd_vv_f32m1(_bias0, __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl), vl); + vfloat32m1_t _out03 = __riscv_vfadd_vv_f32m1(_bias0, __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl), vl); + vfloat32m1_t _out05 = __riscv_vfadd_vv_f32m1(_bias0, __riscv_vfadd_vv_f32m1(__riscv_vfadd_vv_f32m1(_tmp07, _tmp135a, vl), __riscv_vfmacc_vf_f32m1(_tmp135c, 32.f, _tmp135b, vl), vl), vl); + __riscv_vse32_v_f32m1(output0 + packn, _out01, vl); + __riscv_vse32_v_f32m1(output0 + packn * 3, _out03, vl); + __riscv_vse32_v_f32m1(output0 + packn * 5, _out05, vl); output0 += outw * packn; } @@ -323,7 +323,7 @@ static void conv3x3s1_winograd63_transform_output_packn_rvv(const Mat& top_blob_ static void conv3x3s1_winograd43_transform_input_packn_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -370,31 +370,31 @@ static void conv3x3s1_winograd43_transform_input_packn_rvv(const Mat& bottom_blo for (int m = 0; m < 6; m++) { - vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); - vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn, vl); - vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); - vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); - vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); - vfloat32m1_t _r05 = vle32_v_f32m1(r0 + packn * 5, vl); - - vfloat32m1_t _tmp01a = vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, sq2, vl), -sq2_d2, _r03, vl); - vfloat32m1_t _tmp01b = vfmacc_vf_f32m1(_r04, -2.f, _r02, vl); - vfloat32m1_t _tmp23a = vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, sq2_d2, vl), -sq2, _r03, vl); - vfloat32m1_t _tmp23b = vfmacc_vf_f32m1(_r04, -0.5f, _r02, vl); - - vfloat32m1_t _tmp0m = vfmacc_vf_f32m1(vfadd_vv_f32m1(_r00, _r04, vl), -2.5f, _r02, vl); - vfloat32m1_t _tmp1m = vfsub_vv_f32m1(_tmp01b, _tmp01a, vl); - vfloat32m1_t _tmp2m = vfadd_vv_f32m1(_tmp01b, _tmp01a, vl); - vfloat32m1_t _tmp3m = vfsub_vv_f32m1(_tmp23b, _tmp23a, vl); - vfloat32m1_t _tmp4m = vfadd_vv_f32m1(_tmp23b, _tmp23a, vl); - vfloat32m1_t _tmp5m = vfmacc_vf_f32m1(vfadd_vv_f32m1(_r01, _r05, 
vl), -2.5f, _r03, vl); - - vse32_v_f32m1(tmp[0][m], _tmp0m, vl); - vse32_v_f32m1(tmp[1][m], _tmp1m, vl); - vse32_v_f32m1(tmp[2][m], _tmp2m, vl); - vse32_v_f32m1(tmp[3][m], _tmp3m, vl); - vse32_v_f32m1(tmp[4][m], _tmp4m, vl); - vse32_v_f32m1(tmp[5][m], _tmp5m, vl); + vfloat32m1_t _r00 = __riscv_vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = __riscv_vle32_v_f32m1(r0 + packn, vl); + vfloat32m1_t _r02 = __riscv_vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = __riscv_vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = __riscv_vle32_v_f32m1(r0 + packn * 4, vl); + vfloat32m1_t _r05 = __riscv_vle32_v_f32m1(r0 + packn * 5, vl); + + vfloat32m1_t _tmp01a = __riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_r01, sq2, vl), -sq2_d2, _r03, vl); + vfloat32m1_t _tmp01b = __riscv_vfmacc_vf_f32m1(_r04, -2.f, _r02, vl); + vfloat32m1_t _tmp23a = __riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_r01, sq2_d2, vl), -sq2, _r03, vl); + vfloat32m1_t _tmp23b = __riscv_vfmacc_vf_f32m1(_r04, -0.5f, _r02, vl); + + vfloat32m1_t _tmp0m = __riscv_vfmacc_vf_f32m1(__riscv_vfadd_vv_f32m1(_r00, _r04, vl), -2.5f, _r02, vl); + vfloat32m1_t _tmp1m = __riscv_vfsub_vv_f32m1(_tmp01b, _tmp01a, vl); + vfloat32m1_t _tmp2m = __riscv_vfadd_vv_f32m1(_tmp01b, _tmp01a, vl); + vfloat32m1_t _tmp3m = __riscv_vfsub_vv_f32m1(_tmp23b, _tmp23a, vl); + vfloat32m1_t _tmp4m = __riscv_vfadd_vv_f32m1(_tmp23b, _tmp23a, vl); + vfloat32m1_t _tmp5m = __riscv_vfmacc_vf_f32m1(__riscv_vfadd_vv_f32m1(_r01, _r05, vl), -2.5f, _r03, vl); + + __riscv_vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + __riscv_vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + __riscv_vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + __riscv_vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + __riscv_vse32_v_f32m1(tmp[4][m], _tmp4m, vl); + __riscv_vse32_v_f32m1(tmp[5][m], _tmp5m, vl); r0 += w * packn; } @@ -408,31 +408,31 @@ static void conv3x3s1_winograd43_transform_input_packn_rvv(const Mat& bottom_blo for (int m = 0; m < 6; m++) { - vfloat32m1_t _r00 = vle32_v_f32m1(tmp[m][0], vl); - vfloat32m1_t _r01 = vle32_v_f32m1(tmp[m][1], vl); - vfloat32m1_t _r02 = vle32_v_f32m1(tmp[m][2], vl); - vfloat32m1_t _r03 = vle32_v_f32m1(tmp[m][3], vl); - vfloat32m1_t _r04 = vle32_v_f32m1(tmp[m][4], vl); - vfloat32m1_t _r05 = vle32_v_f32m1(tmp[m][5], vl); - - vfloat32m1_t _tmp01a = vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, sq2, vl), -sq2_d2, _r03, vl); - vfloat32m1_t _tmp01b = vfmacc_vf_f32m1(_r04, -2.f, _r02, vl); - vfloat32m1_t _tmp23a = vfmacc_vf_f32m1(vfmul_vf_f32m1(_r01, sq2_d2, vl), -sq2, _r03, vl); - vfloat32m1_t _tmp23b = vfmacc_vf_f32m1(_r04, -0.5f, _r02, vl); - - vfloat32m1_t _tmp0m = vfmacc_vf_f32m1(vfadd_vv_f32m1(_r00, _r04, vl), -2.5f, _r02, vl); - vfloat32m1_t _tmp1m = vfsub_vv_f32m1(_tmp01b, _tmp01a, vl); - vfloat32m1_t _tmp2m = vfadd_vv_f32m1(_tmp01b, _tmp01a, vl); - vfloat32m1_t _tmp3m = vfsub_vv_f32m1(_tmp23b, _tmp23a, vl); - vfloat32m1_t _tmp4m = vfadd_vv_f32m1(_tmp23b, _tmp23a, vl); - vfloat32m1_t _tmp5m = vfmacc_vf_f32m1(vfadd_vv_f32m1(_r01, _r05, vl), -2.5f, _r03, vl); - - vse32_v_f32m1(r0_tm_0, _tmp0m, vl); - vse32_v_f32m1(r0_tm_1, _tmp1m, vl); - vse32_v_f32m1(r0_tm_2, _tmp2m, vl); - vse32_v_f32m1(r0_tm_3, _tmp3m, vl); - vse32_v_f32m1(r0_tm_4, _tmp4m, vl); - vse32_v_f32m1(r0_tm_5, _tmp5m, vl); + vfloat32m1_t _r00 = __riscv_vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _r01 = __riscv_vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _r02 = __riscv_vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _r03 = __riscv_vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _r04 = __riscv_vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _r05 = 
__riscv_vle32_v_f32m1(tmp[m][5], vl); + + vfloat32m1_t _tmp01a = __riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_r01, sq2, vl), -sq2_d2, _r03, vl); + vfloat32m1_t _tmp01b = __riscv_vfmacc_vf_f32m1(_r04, -2.f, _r02, vl); + vfloat32m1_t _tmp23a = __riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_r01, sq2_d2, vl), -sq2, _r03, vl); + vfloat32m1_t _tmp23b = __riscv_vfmacc_vf_f32m1(_r04, -0.5f, _r02, vl); + + vfloat32m1_t _tmp0m = __riscv_vfmacc_vf_f32m1(__riscv_vfadd_vv_f32m1(_r00, _r04, vl), -2.5f, _r02, vl); + vfloat32m1_t _tmp1m = __riscv_vfsub_vv_f32m1(_tmp01b, _tmp01a, vl); + vfloat32m1_t _tmp2m = __riscv_vfadd_vv_f32m1(_tmp01b, _tmp01a, vl); + vfloat32m1_t _tmp3m = __riscv_vfsub_vv_f32m1(_tmp23b, _tmp23a, vl); + vfloat32m1_t _tmp4m = __riscv_vfadd_vv_f32m1(_tmp23b, _tmp23a, vl); + vfloat32m1_t _tmp5m = __riscv_vfmacc_vf_f32m1(__riscv_vfadd_vv_f32m1(_r01, _r05, vl), -2.5f, _r03, vl); + + __riscv_vse32_v_f32m1(r0_tm_0, _tmp0m, vl); + __riscv_vse32_v_f32m1(r0_tm_1, _tmp1m, vl); + __riscv_vse32_v_f32m1(r0_tm_2, _tmp2m, vl); + __riscv_vse32_v_f32m1(r0_tm_3, _tmp3m, vl); + __riscv_vse32_v_f32m1(r0_tm_4, _tmp4m, vl); + __riscv_vse32_v_f32m1(r0_tm_5, _tmp5m, vl); r0_tm_0 += tiles * packn * 6; r0_tm_1 += tiles * packn * 6; @@ -449,7 +449,7 @@ static void conv3x3s1_winograd43_transform_input_packn_rvv(const Mat& bottom_blo static void conv3x3s1_winograd43_transform_output_packn_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -484,7 +484,7 @@ static void conv3x3s1_winograd43_transform_output_packn_rvv(const Mat& top_blob_ const Mat out0_tm = top_blob_tm.channel(p); Mat out0 = top_blob.channel(p); - vfloat32m1_t _bias0 = biasptr ? vle32_v_f32m1(biasptr + p * packn, vl) : vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _bias0 = biasptr ? 
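// [editorial sketch, not part of the patch] The output transforms above keep one code path for the
// optional bias: when biasptr is null, _bias0 is a zero splat from __riscv_vfmv_v_f_f32m1, so the
// later __riscv_vfadd_vv_f32m1 calls simply add zero. The sq2, sq2_d2, sq2_m2 and sq2_d4 factors
// used in the winograd43 transforms are presumably the sqrt(2)-derived constants defined earlier
// in this file; the patch does not touch them. Minimal form of the bias handling (names invented):
#include <riscv_vector.h>

static inline vfloat32m1_t load_bias_or_zero_example(const float* biasptr, int p, int packn, size_t vl)
{
    return biasptr ? __riscv_vle32_v_f32m1(biasptr + p * packn, vl)
                   : __riscv_vfmv_v_f_f32m1(0.f, vl);
}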
__riscv_vle32_v_f32m1(biasptr + p * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); // NOTE variable length array float tmp[4][6][packn]; @@ -505,27 +505,27 @@ static void conv3x3s1_winograd43_transform_output_packn_rvv(const Mat& top_blob_ for (int m = 0; m < 6; m++) { - vfloat32m1_t _r00 = vle32_v_f32m1(output0_tm_0, vl); - vfloat32m1_t _r01 = vle32_v_f32m1(output0_tm_1, vl); - vfloat32m1_t _r02 = vle32_v_f32m1(output0_tm_2, vl); - vfloat32m1_t _r03 = vle32_v_f32m1(output0_tm_3, vl); - vfloat32m1_t _r04 = vle32_v_f32m1(output0_tm_4, vl); - vfloat32m1_t _r05 = vle32_v_f32m1(output0_tm_5, vl); - - vfloat32m1_t _tmp02a = vfadd_vv_f32m1(_r01, _r02, vl); - vfloat32m1_t _tmp02b = vfadd_vv_f32m1(_r03, _r04, vl); - vfloat32m1_t _tmp13a = vfsub_vv_f32m1(_r01, _r02, vl); - vfloat32m1_t _tmp13b = vfsub_vv_f32m1(_r03, _r04, vl); - - vfloat32m1_t _tmp0m = vfadd_vv_f32m1(vfadd_vv_f32m1(_r00, _tmp02a, vl), _tmp02b, vl); - vfloat32m1_t _tmp1m = vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp13a, sq2_d2, vl), sq2, _tmp13b, vl); - vfloat32m1_t _tmp2m = vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp02a, 0.5f, vl), 2.f, _tmp02b, vl); - vfloat32m1_t _tmp3m = vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r05, sq2_d4, _tmp13a, vl), sq2_m2, _tmp13b, vl); - - vse32_v_f32m1(tmp[0][m], _tmp0m, vl); - vse32_v_f32m1(tmp[1][m], _tmp1m, vl); - vse32_v_f32m1(tmp[2][m], _tmp2m, vl); - vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + vfloat32m1_t _r00 = __riscv_vle32_v_f32m1(output0_tm_0, vl); + vfloat32m1_t _r01 = __riscv_vle32_v_f32m1(output0_tm_1, vl); + vfloat32m1_t _r02 = __riscv_vle32_v_f32m1(output0_tm_2, vl); + vfloat32m1_t _r03 = __riscv_vle32_v_f32m1(output0_tm_3, vl); + vfloat32m1_t _r04 = __riscv_vle32_v_f32m1(output0_tm_4, vl); + vfloat32m1_t _r05 = __riscv_vle32_v_f32m1(output0_tm_5, vl); + + vfloat32m1_t _tmp02a = __riscv_vfadd_vv_f32m1(_r01, _r02, vl); + vfloat32m1_t _tmp02b = __riscv_vfadd_vv_f32m1(_r03, _r04, vl); + vfloat32m1_t _tmp13a = __riscv_vfsub_vv_f32m1(_r01, _r02, vl); + vfloat32m1_t _tmp13b = __riscv_vfsub_vv_f32m1(_r03, _r04, vl); + + vfloat32m1_t _tmp0m = __riscv_vfadd_vv_f32m1(__riscv_vfadd_vv_f32m1(_r00, _tmp02a, vl), _tmp02b, vl); + vfloat32m1_t _tmp1m = __riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_tmp13a, sq2_d2, vl), sq2, _tmp13b, vl); + vfloat32m1_t _tmp2m = __riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_tmp02a, 0.5f, vl), 2.f, _tmp02b, vl); + vfloat32m1_t _tmp3m = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(_r05, sq2_d4, _tmp13a, vl), sq2_m2, _tmp13b, vl); + + __riscv_vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + __riscv_vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + __riscv_vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + __riscv_vse32_v_f32m1(tmp[3][m], _tmp3m, vl); output0_tm_0 += tiles * packn * 6; output0_tm_1 += tiles * packn * 6; @@ -537,27 +537,27 @@ static void conv3x3s1_winograd43_transform_output_packn_rvv(const Mat& top_blob_ for (int m = 0; m < 4; m++) { - vfloat32m1_t _r00 = vle32_v_f32m1(tmp[m][0], vl); - vfloat32m1_t _r01 = vle32_v_f32m1(tmp[m][1], vl); - vfloat32m1_t _r02 = vle32_v_f32m1(tmp[m][2], vl); - vfloat32m1_t _r03 = vle32_v_f32m1(tmp[m][3], vl); - vfloat32m1_t _r04 = vle32_v_f32m1(tmp[m][4], vl); - vfloat32m1_t _r05 = vle32_v_f32m1(tmp[m][5], vl); - - vfloat32m1_t _tmp02a = vfadd_vv_f32m1(_r01, _r02, vl); - vfloat32m1_t _tmp02b = vfadd_vv_f32m1(_r03, _r04, vl); - vfloat32m1_t _tmp13a = vfsub_vv_f32m1(_r01, _r02, vl); - vfloat32m1_t _tmp13b = vfsub_vv_f32m1(_r03, _r04, vl); - - vfloat32m1_t _out00 = vfadd_vv_f32m1(_bias0, vfadd_vv_f32m1(vfadd_vv_f32m1(_r00, _tmp02a, vl), _tmp02b, vl), vl); - vfloat32m1_t _out01 = 
vfadd_vv_f32m1(_bias0, vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp13a, sq2_d2, vl), sq2, _tmp13b, vl), vl); - vfloat32m1_t _out02 = vfadd_vv_f32m1(_bias0, vfmacc_vf_f32m1(vfmul_vf_f32m1(_tmp02a, 0.5f, vl), 2.f, _tmp02b, vl), vl); - vfloat32m1_t _out03 = vfadd_vv_f32m1(_bias0, vfmacc_vf_f32m1(vfmacc_vf_f32m1(_r05, sq2_d4, _tmp13a, vl), sq2_m2, _tmp13b, vl), vl); - - vse32_v_f32m1(output0, _out00, vl); - vse32_v_f32m1(output0 + packn, _out01, vl); - vse32_v_f32m1(output0 + packn * 2, _out02, vl); - vse32_v_f32m1(output0 + packn * 3, _out03, vl); + vfloat32m1_t _r00 = __riscv_vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _r01 = __riscv_vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _r02 = __riscv_vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _r03 = __riscv_vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _r04 = __riscv_vle32_v_f32m1(tmp[m][4], vl); + vfloat32m1_t _r05 = __riscv_vle32_v_f32m1(tmp[m][5], vl); + + vfloat32m1_t _tmp02a = __riscv_vfadd_vv_f32m1(_r01, _r02, vl); + vfloat32m1_t _tmp02b = __riscv_vfadd_vv_f32m1(_r03, _r04, vl); + vfloat32m1_t _tmp13a = __riscv_vfsub_vv_f32m1(_r01, _r02, vl); + vfloat32m1_t _tmp13b = __riscv_vfsub_vv_f32m1(_r03, _r04, vl); + + vfloat32m1_t _out00 = __riscv_vfadd_vv_f32m1(_bias0, __riscv_vfadd_vv_f32m1(__riscv_vfadd_vv_f32m1(_r00, _tmp02a, vl), _tmp02b, vl), vl); + vfloat32m1_t _out01 = __riscv_vfadd_vv_f32m1(_bias0, __riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_tmp13a, sq2_d2, vl), sq2, _tmp13b, vl), vl); + vfloat32m1_t _out02 = __riscv_vfadd_vv_f32m1(_bias0, __riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_tmp02a, 0.5f, vl), 2.f, _tmp02b, vl), vl); + vfloat32m1_t _out03 = __riscv_vfadd_vv_f32m1(_bias0, __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(_r05, sq2_d4, _tmp13a, vl), sq2_m2, _tmp13b, vl), vl); + + __riscv_vse32_v_f32m1(output0, _out00, vl); + __riscv_vse32_v_f32m1(output0 + packn, _out01, vl); + __riscv_vse32_v_f32m1(output0 + packn * 2, _out02, vl); + __riscv_vse32_v_f32m1(output0 + packn * 3, _out03, vl); output0 += outw * packn; } @@ -569,7 +569,7 @@ static void conv3x3s1_winograd43_transform_output_packn_rvv(const Mat& top_blob_ static void conv3x3s1_winograd23_transform_input_packn_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -609,20 +609,20 @@ static void conv3x3s1_winograd23_transform_input_packn_rvv(const Mat& bottom_blo for (int m = 0; m < 4; m++) { - vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); - vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn, vl); - vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); - vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r00 = __riscv_vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = __riscv_vle32_v_f32m1(r0 + packn, vl); + vfloat32m1_t _r02 = __riscv_vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = __riscv_vle32_v_f32m1(r0 + packn * 3, vl); - vfloat32m1_t _tmp0m = vfsub_vv_f32m1(_r00, _r02, vl); - vfloat32m1_t _tmp1m = vfadd_vv_f32m1(_r01, _r02, vl); - vfloat32m1_t _tmp2m = vfsub_vv_f32m1(_r02, _r01, vl); - vfloat32m1_t _tmp3m = vfsub_vv_f32m1(_r03, _r01, vl); + vfloat32m1_t _tmp0m = __riscv_vfsub_vv_f32m1(_r00, _r02, vl); + vfloat32m1_t _tmp1m = __riscv_vfadd_vv_f32m1(_r01, _r02, vl); + vfloat32m1_t _tmp2m = __riscv_vfsub_vv_f32m1(_r02, _r01, vl); + vfloat32m1_t _tmp3m = __riscv_vfsub_vv_f32m1(_r03, _r01, vl); - vse32_v_f32m1(tmp[0][m], _tmp0m, vl); - vse32_v_f32m1(tmp[1][m], 
_tmp1m, vl); - vse32_v_f32m1(tmp[2][m], _tmp2m, vl); - vse32_v_f32m1(tmp[3][m], _tmp3m, vl); + __riscv_vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + __riscv_vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + __riscv_vse32_v_f32m1(tmp[2][m], _tmp2m, vl); + __riscv_vse32_v_f32m1(tmp[3][m], _tmp3m, vl); r0 += w * packn; } @@ -634,20 +634,20 @@ static void conv3x3s1_winograd23_transform_input_packn_rvv(const Mat& bottom_blo for (int m = 0; m < 4; m++) { - vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); - vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); - vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); - vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp00 = __riscv_vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = __riscv_vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = __riscv_vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = __riscv_vle32_v_f32m1(tmp[m][3], vl); - vfloat32m1_t _r0tm0 = vfsub_vv_f32m1(_tmp00, _tmp02, vl); - vfloat32m1_t _r0tm1 = vfadd_vv_f32m1(_tmp01, _tmp02, vl); - vfloat32m1_t _r0tm2 = vfsub_vv_f32m1(_tmp02, _tmp01, vl); - vfloat32m1_t _r0tm3 = vfsub_vv_f32m1(_tmp03, _tmp01, vl); + vfloat32m1_t _r0tm0 = __riscv_vfsub_vv_f32m1(_tmp00, _tmp02, vl); + vfloat32m1_t _r0tm1 = __riscv_vfadd_vv_f32m1(_tmp01, _tmp02, vl); + vfloat32m1_t _r0tm2 = __riscv_vfsub_vv_f32m1(_tmp02, _tmp01, vl); + vfloat32m1_t _r0tm3 = __riscv_vfsub_vv_f32m1(_tmp03, _tmp01, vl); - vse32_v_f32m1(r0_tm_0, _r0tm0, vl); - vse32_v_f32m1(r0_tm_1, _r0tm1, vl); - vse32_v_f32m1(r0_tm_2, _r0tm2, vl); - vse32_v_f32m1(r0_tm_3, _r0tm3, vl); + __riscv_vse32_v_f32m1(r0_tm_0, _r0tm0, vl); + __riscv_vse32_v_f32m1(r0_tm_1, _r0tm1, vl); + __riscv_vse32_v_f32m1(r0_tm_2, _r0tm2, vl); + __riscv_vse32_v_f32m1(r0_tm_3, _r0tm3, vl); r0_tm_0 += tiles * packn * 4; r0_tm_1 += tiles * packn * 4; @@ -662,7 +662,7 @@ static void conv3x3s1_winograd23_transform_input_packn_rvv(const Mat& bottom_blo static void conv3x3s1_winograd23_transform_output_packn_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -688,7 +688,7 @@ static void conv3x3s1_winograd23_transform_output_packn_rvv(const Mat& top_blob_ const Mat out0_tm = top_blob_tm.channel(p); Mat out0 = top_blob.channel(p); - vfloat32m1_t _bias0 = biasptr ? vle32_v_f32m1(biasptr + p * packn, vl) : vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _bias0 = biasptr ? 
__riscv_vle32_v_f32m1(biasptr + p * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); // NOTE variable length array float tmp[2][4][packn]; @@ -707,16 +707,16 @@ static void conv3x3s1_winograd23_transform_output_packn_rvv(const Mat& top_blob_ for (int m = 0; m < 4; m++) { - vfloat32m1_t _out0tm0 = vle32_v_f32m1(output0_tm_0, vl); - vfloat32m1_t _out0tm1 = vle32_v_f32m1(output0_tm_1, vl); - vfloat32m1_t _out0tm2 = vle32_v_f32m1(output0_tm_2, vl); - vfloat32m1_t _out0tm3 = vle32_v_f32m1(output0_tm_3, vl); + vfloat32m1_t _out0tm0 = __riscv_vle32_v_f32m1(output0_tm_0, vl); + vfloat32m1_t _out0tm1 = __riscv_vle32_v_f32m1(output0_tm_1, vl); + vfloat32m1_t _out0tm2 = __riscv_vle32_v_f32m1(output0_tm_2, vl); + vfloat32m1_t _out0tm3 = __riscv_vle32_v_f32m1(output0_tm_3, vl); - vfloat32m1_t _tmp0m = vfadd_vv_f32m1(vfadd_vv_f32m1(_out0tm0, _out0tm1, vl), _out0tm2, vl); - vfloat32m1_t _tmp1m = vfadd_vv_f32m1(vfsub_vv_f32m1(_out0tm1, _out0tm2, vl), _out0tm3, vl); + vfloat32m1_t _tmp0m = __riscv_vfadd_vv_f32m1(__riscv_vfadd_vv_f32m1(_out0tm0, _out0tm1, vl), _out0tm2, vl); + vfloat32m1_t _tmp1m = __riscv_vfadd_vv_f32m1(__riscv_vfsub_vv_f32m1(_out0tm1, _out0tm2, vl), _out0tm3, vl); - vse32_v_f32m1(tmp[0][m], _tmp0m, vl); - vse32_v_f32m1(tmp[1][m], _tmp1m, vl); + __riscv_vse32_v_f32m1(tmp[0][m], _tmp0m, vl); + __riscv_vse32_v_f32m1(tmp[1][m], _tmp1m, vl); output0_tm_0 += tiles * packn * 4; output0_tm_1 += tiles * packn * 4; @@ -726,16 +726,16 @@ static void conv3x3s1_winograd23_transform_output_packn_rvv(const Mat& top_blob_ for (int m = 0; m < 2; m++) { - vfloat32m1_t _tmp00 = vle32_v_f32m1(tmp[m][0], vl); - vfloat32m1_t _tmp01 = vle32_v_f32m1(tmp[m][1], vl); - vfloat32m1_t _tmp02 = vle32_v_f32m1(tmp[m][2], vl); - vfloat32m1_t _tmp03 = vle32_v_f32m1(tmp[m][3], vl); + vfloat32m1_t _tmp00 = __riscv_vle32_v_f32m1(tmp[m][0], vl); + vfloat32m1_t _tmp01 = __riscv_vle32_v_f32m1(tmp[m][1], vl); + vfloat32m1_t _tmp02 = __riscv_vle32_v_f32m1(tmp[m][2], vl); + vfloat32m1_t _tmp03 = __riscv_vle32_v_f32m1(tmp[m][3], vl); - vfloat32m1_t _out00 = vfadd_vv_f32m1(_bias0, vfadd_vv_f32m1(vfadd_vv_f32m1(_tmp00, _tmp01, vl), _tmp02, vl), vl); - vfloat32m1_t _out01 = vfadd_vv_f32m1(_bias0, vfadd_vv_f32m1(vfsub_vv_f32m1(_tmp01, _tmp02, vl), _tmp03, vl), vl); + vfloat32m1_t _out00 = __riscv_vfadd_vv_f32m1(_bias0, __riscv_vfadd_vv_f32m1(__riscv_vfadd_vv_f32m1(_tmp00, _tmp01, vl), _tmp02, vl), vl); + vfloat32m1_t _out01 = __riscv_vfadd_vv_f32m1(_bias0, __riscv_vfadd_vv_f32m1(__riscv_vfsub_vv_f32m1(_tmp01, _tmp02, vl), _tmp03, vl), vl); - vse32_v_f32m1(output0, _out00, vl); - vse32_v_f32m1(output0 + packn, _out01, vl); + __riscv_vse32_v_f32m1(output0, _out00, vl); + __riscv_vse32_v_f32m1(output0 + packn, _out01, vl); output0 += outw * packn; } diff --git a/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h b/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h index 145ca220f56..3119f665d2f 100644 --- a/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h +++ b/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h @@ -15,7 +15,7 @@ static void conv3x3s1_winograd63_transform_input_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -72,43 +72,43 @@ static void conv3x3s1_winograd63_transform_input_packn_fp16sa_rvv(const Mat& bot for (int m = 0; m < 8; m++) { - vfloat16m1_t _r00 = 
vle16_v_f16m1(r0, vl); - vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn, vl); - vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); - vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); - vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); - vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl); - vfloat16m1_t _r06 = vle16_v_f16m1(r0 + packn * 6, vl); - vfloat16m1_t _r07 = vle16_v_f16m1(r0 + packn * 7, vl); - - vfloat16m1_t _tmp0m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r00, _r06, vl), 5.25f, vfsub_vv_f16m1(_r04, _r02, vl), vl); - vfloat16m1_t _tmp7m = vfmacc_vf_f16m1(vfsub_vv_f16m1(_r07, _r01, vl), 5.25f, vfsub_vv_f16m1(_r03, _r05, vl), vl); - vse16_v_f16m1(tmp[0][m], _tmp0m, vl); - vse16_v_f16m1(tmp[7][m], _tmp7m, vl); - - vfloat16m1_t _tmp12a = vfmacc_vf_f16m1(vfadd_vv_f16m1(_r02, _r06, vl), -4.25f, _r04, vl); - vfloat16m1_t _tmp12b = vfmacc_vf_f16m1(vfadd_vv_f16m1(_r01, _r05, vl), -4.25f, _r03, vl); - - vfloat16m1_t _tmp1m = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); - vfloat16m1_t _tmp2m = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); - vse16_v_f16m1(tmp[1][m], _tmp1m, vl); - vse16_v_f16m1(tmp[2][m], _tmp2m, vl); - - vfloat16m1_t _tmp34a = vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); - vfloat16m1_t _tmp34b = vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 0.5f, vl), -2.5f, _r03, vl), 2.f, _r05, vl); - - vfloat16m1_t _tmp3m = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); - vfloat16m1_t _tmp4m = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); - vse16_v_f16m1(tmp[3][m], _tmp3m, vl); - vse16_v_f16m1(tmp[4][m], _tmp4m, vl); - - vfloat16m1_t _tmp56a = vfmacc_vf_f16m1(_r06, 4.f, vfmacc_vf_f16m1(_r02, -1.25f, _r04, vl), vl); - vfloat16m1_t _tmp56b = vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, vl); - - vfloat16m1_t _tmp5m = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); - vfloat16m1_t _tmp6m = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); - vse16_v_f16m1(tmp[5][m], _tmp5m, vl); - vse16_v_f16m1(tmp[6][m], _tmp6m, vl); + vfloat16m1_t _r00 = __riscv_vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = __riscv_vle16_v_f16m1(r0 + packn, vl); + vfloat16m1_t _r02 = __riscv_vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = __riscv_vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r04 = __riscv_vle16_v_f16m1(r0 + packn * 4, vl); + vfloat16m1_t _r05 = __riscv_vle16_v_f16m1(r0 + packn * 5, vl); + vfloat16m1_t _r06 = __riscv_vle16_v_f16m1(r0 + packn * 6, vl); + vfloat16m1_t _r07 = __riscv_vle16_v_f16m1(r0 + packn * 7, vl); + + vfloat16m1_t _tmp0m = __riscv_vfmacc_vf_f16m1(__riscv_vfsub_vv_f16m1(_r00, _r06, vl), 5.25f, __riscv_vfsub_vv_f16m1(_r04, _r02, vl), vl); + vfloat16m1_t _tmp7m = __riscv_vfmacc_vf_f16m1(__riscv_vfsub_vv_f16m1(_r07, _r01, vl), 5.25f, __riscv_vfsub_vv_f16m1(_r03, _r05, vl), vl); + __riscv_vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + __riscv_vse16_v_f16m1(tmp[7][m], _tmp7m, vl); + + vfloat16m1_t _tmp12a = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_r02, _r06, vl), -4.25f, _r04, vl); + vfloat16m1_t _tmp12b = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_r01, _r05, vl), -4.25f, _r03, vl); + + vfloat16m1_t _tmp1m = __riscv_vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); + vfloat16m1_t _tmp2m = __riscv_vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); + __riscv_vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + __riscv_vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + + vfloat16m1_t _tmp34a = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_r06, 0.25f, _r02, vl), -1.25f, _r04, vl); + vfloat16m1_t _tmp34b = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_r01, 0.5f, vl), 
-2.5f, _r03, vl), 2.f, _r05, vl); + + vfloat16m1_t _tmp3m = __riscv_vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); + vfloat16m1_t _tmp4m = __riscv_vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); + __riscv_vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + __riscv_vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + + vfloat16m1_t _tmp56a = __riscv_vfmacc_vf_f16m1(_r06, 4.f, __riscv_vfmacc_vf_f16m1(_r02, -1.25f, _r04, vl), vl); + vfloat16m1_t _tmp56b = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_r01, 2.f, vl), -2.5f, _r03, vl), 0.5f, _r05, vl); + + vfloat16m1_t _tmp5m = __riscv_vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); + vfloat16m1_t _tmp6m = __riscv_vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); + __riscv_vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + __riscv_vse16_v_f16m1(tmp[6][m], _tmp6m, vl); r0 += w * packn; } @@ -124,44 +124,44 @@ static void conv3x3s1_winograd63_transform_input_packn_fp16sa_rvv(const Mat& bot for (int m = 0; m < 8; m++) { - vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); - vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); - vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); - vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); - vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); - vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); - vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); - vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); + vfloat16m1_t _tmp00 = __riscv_vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = __riscv_vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = __riscv_vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = __riscv_vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = __riscv_vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = __riscv_vle16_v_f16m1(tmp[m][5], vl); + vfloat16m1_t _tmp06 = __riscv_vle16_v_f16m1(tmp[m][6], vl); + vfloat16m1_t _tmp07 = __riscv_vle16_v_f16m1(tmp[m][7], vl); - vfloat16m1_t _r0tm0 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp00, _tmp06, vl), 5.25f, vfsub_vv_f16m1(_tmp04, _tmp02, vl), vl); - vfloat16m1_t _r0tm7 = vfmacc_vf_f16m1(vfsub_vv_f16m1(_tmp07, _tmp01, vl), 5.25f, vfsub_vv_f16m1(_tmp03, _tmp05, vl), vl); + vfloat16m1_t _r0tm0 = __riscv_vfmacc_vf_f16m1(__riscv_vfsub_vv_f16m1(_tmp00, _tmp06, vl), 5.25f, __riscv_vfsub_vv_f16m1(_tmp04, _tmp02, vl), vl); + vfloat16m1_t _r0tm7 = __riscv_vfmacc_vf_f16m1(__riscv_vfsub_vv_f16m1(_tmp07, _tmp01, vl), 5.25f, __riscv_vfsub_vv_f16m1(_tmp03, _tmp05, vl), vl); - vfloat16m1_t _tmp12a = vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); - vfloat16m1_t _tmp12b = vfmacc_vf_f16m1(vfadd_vv_f16m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); + vfloat16m1_t _tmp12a = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_tmp02, _tmp06, vl), -4.25f, _tmp04, vl); + vfloat16m1_t _tmp12b = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_tmp01, _tmp05, vl), -4.25f, _tmp03, vl); - vfloat16m1_t _r0tm1 = vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); - vfloat16m1_t _r0tm2 = vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); + vfloat16m1_t _r0tm1 = __riscv_vfadd_vv_f16m1(_tmp12a, _tmp12b, vl); + vfloat16m1_t _r0tm2 = __riscv_vfsub_vv_f16m1(_tmp12a, _tmp12b, vl); - vfloat16m1_t _tmp34a = vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); - vfloat16m1_t _tmp34b = vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 0.5f, vl), -2.5f, _tmp03, vl), 2.f, _tmp05, vl); + vfloat16m1_t _tmp34a = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp06, 0.25f, _tmp02, vl), -1.25f, _tmp04, vl); + vfloat16m1_t _tmp34b = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_tmp01, 0.5f, 
vl), -2.5f, _tmp03, vl), 2.f, _tmp05, vl); - vfloat16m1_t _r0tm3 = vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); - vfloat16m1_t _r0tm4 = vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); + vfloat16m1_t _r0tm3 = __riscv_vfadd_vv_f16m1(_tmp34a, _tmp34b, vl); + vfloat16m1_t _r0tm4 = __riscv_vfsub_vv_f16m1(_tmp34a, _tmp34b, vl); - vfloat16m1_t _tmp56a = vfmacc_vf_f16m1(_tmp06, 4.f, vfmacc_vf_f16m1(_tmp02, -1.25f, _tmp04, vl), vl); - vfloat16m1_t _tmp56b = vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, _tmp05, vl); + vfloat16m1_t _tmp56a = __riscv_vfmacc_vf_f16m1(_tmp06, 4.f, __riscv_vfmacc_vf_f16m1(_tmp02, -1.25f, _tmp04, vl), vl); + vfloat16m1_t _tmp56b = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_tmp01, 2.f, vl), -2.5f, _tmp03, vl), 0.5f, _tmp05, vl); - vfloat16m1_t _r0tm5 = vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); - vfloat16m1_t _r0tm6 = vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); + vfloat16m1_t _r0tm5 = __riscv_vfadd_vv_f16m1(_tmp56a, _tmp56b, vl); + vfloat16m1_t _r0tm6 = __riscv_vfsub_vv_f16m1(_tmp56a, _tmp56b, vl); - vse16_v_f16m1(r0_tm_0, _r0tm0, vl); - vse16_v_f16m1(r0_tm_1, _r0tm1, vl); - vse16_v_f16m1(r0_tm_2, _r0tm2, vl); - vse16_v_f16m1(r0_tm_3, _r0tm3, vl); - vse16_v_f16m1(r0_tm_4, _r0tm4, vl); - vse16_v_f16m1(r0_tm_5, _r0tm5, vl); - vse16_v_f16m1(r0_tm_6, _r0tm6, vl); - vse16_v_f16m1(r0_tm_7, _r0tm7, vl); + __riscv_vse16_v_f16m1(r0_tm_0, _r0tm0, vl); + __riscv_vse16_v_f16m1(r0_tm_1, _r0tm1, vl); + __riscv_vse16_v_f16m1(r0_tm_2, _r0tm2, vl); + __riscv_vse16_v_f16m1(r0_tm_3, _r0tm3, vl); + __riscv_vse16_v_f16m1(r0_tm_4, _r0tm4, vl); + __riscv_vse16_v_f16m1(r0_tm_5, _r0tm5, vl); + __riscv_vse16_v_f16m1(r0_tm_6, _r0tm6, vl); + __riscv_vse16_v_f16m1(r0_tm_7, _r0tm7, vl); r0_tm_0 += tiles * packn * 8; r0_tm_1 += tiles * packn * 8; @@ -180,7 +180,7 @@ static void conv3x3s1_winograd63_transform_input_packn_fp16sa_rvv(const Mat& bot static void conv3x3s1_winograd63_transform_output_packn_fp16sa_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -214,7 +214,7 @@ static void conv3x3s1_winograd63_transform_output_packn_fp16sa_rvv(const Mat& to const Mat out0_tm = top_blob_tm.channel(p); Mat out0 = top_blob.channel(p); - vfloat16m1_t _bias0 = biasptr ? vle16_v_f16m1(biasptr + p * packn, vl) : vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _bias0 = biasptr ? 
__riscv_vle16_v_f16m1(biasptr + p * packn, vl) : __riscv_vfmv_v_f_f16m1(0.f, vl); // NOTE c99 variable length array __fp16 tmp[6][8][packn]; @@ -237,37 +237,37 @@ static void conv3x3s1_winograd63_transform_output_packn_fp16sa_rvv(const Mat& to for (int m = 0; m < 8; m++) { - vfloat16m1_t _out0tm0 = vle16_v_f16m1(output0_tm_0, vl); - vfloat16m1_t _out0tm1 = vle16_v_f16m1(output0_tm_1, vl); - vfloat16m1_t _out0tm2 = vle16_v_f16m1(output0_tm_2, vl); - vfloat16m1_t _out0tm3 = vle16_v_f16m1(output0_tm_3, vl); - vfloat16m1_t _out0tm4 = vle16_v_f16m1(output0_tm_4, vl); - vfloat16m1_t _out0tm5 = vle16_v_f16m1(output0_tm_5, vl); - vfloat16m1_t _out0tm6 = vle16_v_f16m1(output0_tm_6, vl); - vfloat16m1_t _out0tm7 = vle16_v_f16m1(output0_tm_7, vl); - - vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_out0tm1, _out0tm2, vl); - vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_out0tm1, _out0tm2, vl); - - vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_out0tm3, _out0tm4, vl); - vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_out0tm3, _out0tm4, vl); - - vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_out0tm5, _out0tm6, vl); - vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_out0tm5, _out0tm6, vl); - - vfloat16m1_t _tmp0m = vfadd_vv_f16m1(vfadd_vv_f16m1(_out0tm0, _tmp024a, vl), vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl); - vfloat16m1_t _tmp2m = vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); - vfloat16m1_t _tmp4m = vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); - vse16_v_f16m1(tmp[0][m], _tmp0m, vl); - vse16_v_f16m1(tmp[2][m], _tmp2m, vl); - vse16_v_f16m1(tmp[4][m], _tmp4m, vl); - - vfloat16m1_t _tmp1m = vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); - vfloat16m1_t _tmp3m = vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); - vfloat16m1_t _tmp5m = vfadd_vv_f16m1(vfadd_vv_f16m1(_out0tm7, _tmp135a, vl), vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); - vse16_v_f16m1(tmp[1][m], _tmp1m, vl); - vse16_v_f16m1(tmp[3][m], _tmp3m, vl); - vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + vfloat16m1_t _out0tm0 = __riscv_vle16_v_f16m1(output0_tm_0, vl); + vfloat16m1_t _out0tm1 = __riscv_vle16_v_f16m1(output0_tm_1, vl); + vfloat16m1_t _out0tm2 = __riscv_vle16_v_f16m1(output0_tm_2, vl); + vfloat16m1_t _out0tm3 = __riscv_vle16_v_f16m1(output0_tm_3, vl); + vfloat16m1_t _out0tm4 = __riscv_vle16_v_f16m1(output0_tm_4, vl); + vfloat16m1_t _out0tm5 = __riscv_vle16_v_f16m1(output0_tm_5, vl); + vfloat16m1_t _out0tm6 = __riscv_vle16_v_f16m1(output0_tm_6, vl); + vfloat16m1_t _out0tm7 = __riscv_vle16_v_f16m1(output0_tm_7, vl); + + vfloat16m1_t _tmp024a = __riscv_vfadd_vv_f16m1(_out0tm1, _out0tm2, vl); + vfloat16m1_t _tmp135a = __riscv_vfsub_vv_f16m1(_out0tm1, _out0tm2, vl); + + vfloat16m1_t _tmp024b = __riscv_vfadd_vv_f16m1(_out0tm3, _out0tm4, vl); + vfloat16m1_t _tmp135b = __riscv_vfsub_vv_f16m1(_out0tm3, _out0tm4, vl); + + vfloat16m1_t _tmp024c = __riscv_vfadd_vv_f16m1(_out0tm5, _out0tm6, vl); + vfloat16m1_t _tmp135c = __riscv_vfsub_vv_f16m1(_out0tm5, _out0tm6, vl); + + vfloat16m1_t _tmp0m = __riscv_vfadd_vv_f16m1(__riscv_vfadd_vv_f16m1(_out0tm0, _tmp024a, vl), __riscv_vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl); + vfloat16m1_t _tmp2m = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl); + vfloat16m1_t _tmp4m = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl); + __riscv_vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + __riscv_vse16_v_f16m1(tmp[2][m], 
_tmp2m, vl); + __riscv_vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + + vfloat16m1_t _tmp1m = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl); + vfloat16m1_t _tmp3m = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl); + vfloat16m1_t _tmp5m = __riscv_vfadd_vv_f16m1(__riscv_vfadd_vv_f16m1(_out0tm7, _tmp135a, vl), __riscv_vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl); + __riscv_vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + __riscv_vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + __riscv_vse16_v_f16m1(tmp[5][m], _tmp5m, vl); output0_tm_0 += tiles * packn * 8; output0_tm_1 += tiles * packn * 8; @@ -281,37 +281,37 @@ static void conv3x3s1_winograd63_transform_output_packn_fp16sa_rvv(const Mat& to for (int m = 0; m < 6; m++) { - vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); - vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); - vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); - vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); - vfloat16m1_t _tmp04 = vle16_v_f16m1(tmp[m][4], vl); - vfloat16m1_t _tmp05 = vle16_v_f16m1(tmp[m][5], vl); - vfloat16m1_t _tmp06 = vle16_v_f16m1(tmp[m][6], vl); - vfloat16m1_t _tmp07 = vle16_v_f16m1(tmp[m][7], vl); - - vfloat16m1_t _tmp024a = vfadd_vv_f16m1(_tmp01, _tmp02, vl); - vfloat16m1_t _tmp135a = vfsub_vv_f16m1(_tmp01, _tmp02, vl); - - vfloat16m1_t _tmp024b = vfadd_vv_f16m1(_tmp03, _tmp04, vl); - vfloat16m1_t _tmp135b = vfsub_vv_f16m1(_tmp03, _tmp04, vl); - - vfloat16m1_t _tmp024c = vfadd_vv_f16m1(_tmp05, _tmp06, vl); - vfloat16m1_t _tmp135c = vfsub_vv_f16m1(_tmp05, _tmp06, vl); - - vfloat16m1_t _out00 = vfadd_vv_f16m1(_bias0, vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp024a, vl), vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl), vl); - vfloat16m1_t _out02 = vfadd_vv_f16m1(_bias0, vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl), vl); - vfloat16m1_t _out04 = vfadd_vv_f16m1(_bias0, vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl), vl); - vse16_v_f16m1(output0, _out00, vl); - vse16_v_f16m1(output0 + packn * 2, _out02, vl); - vse16_v_f16m1(output0 + packn * 4, _out04, vl); - - vfloat16m1_t _out01 = vfadd_vv_f16m1(_bias0, vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl), vl); - vfloat16m1_t _out03 = vfadd_vv_f16m1(_bias0, vfmacc_vf_f16m1(vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl), vl); - vfloat16m1_t _out05 = vfadd_vv_f16m1(_bias0, vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp07, _tmp135a, vl), vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl), vl); - vse16_v_f16m1(output0 + packn, _out01, vl); - vse16_v_f16m1(output0 + packn * 3, _out03, vl); - vse16_v_f16m1(output0 + packn * 5, _out05, vl); + vfloat16m1_t _tmp00 = __riscv_vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = __riscv_vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = __riscv_vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = __riscv_vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp04 = __riscv_vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _tmp05 = __riscv_vle16_v_f16m1(tmp[m][5], vl); + vfloat16m1_t _tmp06 = __riscv_vle16_v_f16m1(tmp[m][6], vl); + vfloat16m1_t _tmp07 = __riscv_vle16_v_f16m1(tmp[m][7], vl); + + vfloat16m1_t _tmp024a = __riscv_vfadd_vv_f16m1(_tmp01, _tmp02, vl); + vfloat16m1_t _tmp135a = __riscv_vfsub_vv_f16m1(_tmp01, _tmp02, vl); + + vfloat16m1_t _tmp024b = __riscv_vfadd_vv_f16m1(_tmp03, _tmp04, vl); + vfloat16m1_t _tmp135b = __riscv_vfsub_vv_f16m1(_tmp03, _tmp04, vl); + + 
vfloat16m1_t _tmp024c = __riscv_vfadd_vv_f16m1(_tmp05, _tmp06, vl); + vfloat16m1_t _tmp135c = __riscv_vfsub_vv_f16m1(_tmp05, _tmp06, vl); + + vfloat16m1_t _out00 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfadd_vv_f16m1(__riscv_vfadd_vv_f16m1(_tmp00, _tmp024a, vl), __riscv_vfmacc_vf_f16m1(_tmp024b, 32.f, _tmp024c, vl), vl), vl); + vfloat16m1_t _out02 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp024a, 4.f, _tmp024b, vl), 8.f, _tmp024c, vl), vl); + vfloat16m1_t _out04 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp024a, 16.f, _tmp024b, vl), 2.f, _tmp024c, vl), vl); + __riscv_vse16_v_f16m1(output0, _out00, vl); + __riscv_vse16_v_f16m1(output0 + packn * 2, _out02, vl); + __riscv_vse16_v_f16m1(output0 + packn * 4, _out04, vl); + + vfloat16m1_t _out01 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp135a, 2.f, _tmp135b, vl), 16.f, _tmp135c, vl), vl); + vfloat16m1_t _out03 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_tmp135a, 8.f, _tmp135b, vl), 4.f, _tmp135c, vl), vl); + vfloat16m1_t _out05 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfadd_vv_f16m1(__riscv_vfadd_vv_f16m1(_tmp07, _tmp135a, vl), __riscv_vfmacc_vf_f16m1(_tmp135c, 32.f, _tmp135b, vl), vl), vl); + __riscv_vse16_v_f16m1(output0 + packn, _out01, vl); + __riscv_vse16_v_f16m1(output0 + packn * 3, _out03, vl); + __riscv_vse16_v_f16m1(output0 + packn * 5, _out05, vl); output0 += outw * packn; } @@ -323,7 +323,7 @@ static void conv3x3s1_winograd63_transform_output_packn_fp16sa_rvv(const Mat& to static void conv3x3s1_winograd43_transform_input_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -370,31 +370,31 @@ static void conv3x3s1_winograd43_transform_input_packn_fp16sa_rvv(const Mat& bot for (int m = 0; m < 6; m++) { - vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); - vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn, vl); - vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); - vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); - vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); - vfloat16m1_t _r05 = vle16_v_f16m1(r0 + packn * 5, vl); - - vfloat16m1_t _tmp01a = vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, sq2, vl), -sq2_d2, _r03, vl); - vfloat16m1_t _tmp01b = vfmacc_vf_f16m1(_r04, -2.f, _r02, vl); - vfloat16m1_t _tmp23a = vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, sq2_d2, vl), -sq2, _r03, vl); - vfloat16m1_t _tmp23b = vfmacc_vf_f16m1(_r04, -0.5f, _r02, vl); - - vfloat16m1_t _tmp0m = vfmacc_vf_f16m1(vfadd_vv_f16m1(_r00, _r04, vl), -2.5f, _r02, vl); - vfloat16m1_t _tmp1m = vfsub_vv_f16m1(_tmp01b, _tmp01a, vl); - vfloat16m1_t _tmp2m = vfadd_vv_f16m1(_tmp01b, _tmp01a, vl); - vfloat16m1_t _tmp3m = vfsub_vv_f16m1(_tmp23b, _tmp23a, vl); - vfloat16m1_t _tmp4m = vfadd_vv_f16m1(_tmp23b, _tmp23a, vl); - vfloat16m1_t _tmp5m = vfmacc_vf_f16m1(vfadd_vv_f16m1(_r01, _r05, vl), -2.5f, _r03, vl); - - vse16_v_f16m1(tmp[0][m], _tmp0m, vl); - vse16_v_f16m1(tmp[1][m], _tmp1m, vl); - vse16_v_f16m1(tmp[2][m], _tmp2m, vl); - vse16_v_f16m1(tmp[3][m], _tmp3m, vl); - vse16_v_f16m1(tmp[4][m], _tmp4m, vl); - vse16_v_f16m1(tmp[5][m], _tmp5m, vl); + vfloat16m1_t _r00 = __riscv_vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = __riscv_vle16_v_f16m1(r0 + packn, vl); + vfloat16m1_t _r02 = __riscv_vle16_v_f16m1(r0 + packn * 2, vl); 
+ vfloat16m1_t _r03 = __riscv_vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r04 = __riscv_vle16_v_f16m1(r0 + packn * 4, vl); + vfloat16m1_t _r05 = __riscv_vle16_v_f16m1(r0 + packn * 5, vl); + + vfloat16m1_t _tmp01a = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_r01, sq2, vl), -sq2_d2, _r03, vl); + vfloat16m1_t _tmp01b = __riscv_vfmacc_vf_f16m1(_r04, -2.f, _r02, vl); + vfloat16m1_t _tmp23a = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_r01, sq2_d2, vl), -sq2, _r03, vl); + vfloat16m1_t _tmp23b = __riscv_vfmacc_vf_f16m1(_r04, -0.5f, _r02, vl); + + vfloat16m1_t _tmp0m = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_r00, _r04, vl), -2.5f, _r02, vl); + vfloat16m1_t _tmp1m = __riscv_vfsub_vv_f16m1(_tmp01b, _tmp01a, vl); + vfloat16m1_t _tmp2m = __riscv_vfadd_vv_f16m1(_tmp01b, _tmp01a, vl); + vfloat16m1_t _tmp3m = __riscv_vfsub_vv_f16m1(_tmp23b, _tmp23a, vl); + vfloat16m1_t _tmp4m = __riscv_vfadd_vv_f16m1(_tmp23b, _tmp23a, vl); + vfloat16m1_t _tmp5m = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_r01, _r05, vl), -2.5f, _r03, vl); + + __riscv_vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + __riscv_vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + __riscv_vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + __riscv_vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + __riscv_vse16_v_f16m1(tmp[4][m], _tmp4m, vl); + __riscv_vse16_v_f16m1(tmp[5][m], _tmp5m, vl); r0 += w * packn; } @@ -408,31 +408,31 @@ static void conv3x3s1_winograd43_transform_input_packn_fp16sa_rvv(const Mat& bot for (int m = 0; m < 6; m++) { - vfloat16m1_t _r00 = vle16_v_f16m1(tmp[m][0], vl); - vfloat16m1_t _r01 = vle16_v_f16m1(tmp[m][1], vl); - vfloat16m1_t _r02 = vle16_v_f16m1(tmp[m][2], vl); - vfloat16m1_t _r03 = vle16_v_f16m1(tmp[m][3], vl); - vfloat16m1_t _r04 = vle16_v_f16m1(tmp[m][4], vl); - vfloat16m1_t _r05 = vle16_v_f16m1(tmp[m][5], vl); - - vfloat16m1_t _tmp01a = vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, sq2, vl), -sq2_d2, _r03, vl); - vfloat16m1_t _tmp01b = vfmacc_vf_f16m1(_r04, -2.f, _r02, vl); - vfloat16m1_t _tmp23a = vfmacc_vf_f16m1(vfmul_vf_f16m1(_r01, sq2_d2, vl), -sq2, _r03, vl); - vfloat16m1_t _tmp23b = vfmacc_vf_f16m1(_r04, -0.5f, _r02, vl); - - vfloat16m1_t _tmp0m = vfmacc_vf_f16m1(vfadd_vv_f16m1(_r00, _r04, vl), -2.5f, _r02, vl); - vfloat16m1_t _tmp1m = vfsub_vv_f16m1(_tmp01b, _tmp01a, vl); - vfloat16m1_t _tmp2m = vfadd_vv_f16m1(_tmp01b, _tmp01a, vl); - vfloat16m1_t _tmp3m = vfsub_vv_f16m1(_tmp23b, _tmp23a, vl); - vfloat16m1_t _tmp4m = vfadd_vv_f16m1(_tmp23b, _tmp23a, vl); - vfloat16m1_t _tmp5m = vfmacc_vf_f16m1(vfadd_vv_f16m1(_r01, _r05, vl), -2.5f, _r03, vl); - - vse16_v_f16m1(r0_tm_0, _tmp0m, vl); - vse16_v_f16m1(r0_tm_1, _tmp1m, vl); - vse16_v_f16m1(r0_tm_2, _tmp2m, vl); - vse16_v_f16m1(r0_tm_3, _tmp3m, vl); - vse16_v_f16m1(r0_tm_4, _tmp4m, vl); - vse16_v_f16m1(r0_tm_5, _tmp5m, vl); + vfloat16m1_t _r00 = __riscv_vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _r01 = __riscv_vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _r02 = __riscv_vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _r03 = __riscv_vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _r04 = __riscv_vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _r05 = __riscv_vle16_v_f16m1(tmp[m][5], vl); + + vfloat16m1_t _tmp01a = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_r01, sq2, vl), -sq2_d2, _r03, vl); + vfloat16m1_t _tmp01b = __riscv_vfmacc_vf_f16m1(_r04, -2.f, _r02, vl); + vfloat16m1_t _tmp23a = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_r01, sq2_d2, vl), -sq2, _r03, vl); + vfloat16m1_t _tmp23b = __riscv_vfmacc_vf_f16m1(_r04, -0.5f, _r02, vl); + + vfloat16m1_t _tmp0m = 
__riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_r00, _r04, vl), -2.5f, _r02, vl); + vfloat16m1_t _tmp1m = __riscv_vfsub_vv_f16m1(_tmp01b, _tmp01a, vl); + vfloat16m1_t _tmp2m = __riscv_vfadd_vv_f16m1(_tmp01b, _tmp01a, vl); + vfloat16m1_t _tmp3m = __riscv_vfsub_vv_f16m1(_tmp23b, _tmp23a, vl); + vfloat16m1_t _tmp4m = __riscv_vfadd_vv_f16m1(_tmp23b, _tmp23a, vl); + vfloat16m1_t _tmp5m = __riscv_vfmacc_vf_f16m1(__riscv_vfadd_vv_f16m1(_r01, _r05, vl), -2.5f, _r03, vl); + + __riscv_vse16_v_f16m1(r0_tm_0, _tmp0m, vl); + __riscv_vse16_v_f16m1(r0_tm_1, _tmp1m, vl); + __riscv_vse16_v_f16m1(r0_tm_2, _tmp2m, vl); + __riscv_vse16_v_f16m1(r0_tm_3, _tmp3m, vl); + __riscv_vse16_v_f16m1(r0_tm_4, _tmp4m, vl); + __riscv_vse16_v_f16m1(r0_tm_5, _tmp5m, vl); r0_tm_0 += tiles * packn * 6; r0_tm_1 += tiles * packn * 6; @@ -449,7 +449,7 @@ static void conv3x3s1_winograd43_transform_input_packn_fp16sa_rvv(const Mat& bot static void conv3x3s1_winograd43_transform_output_packn_fp16sa_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -484,7 +484,7 @@ static void conv3x3s1_winograd43_transform_output_packn_fp16sa_rvv(const Mat& to const Mat out0_tm = top_blob_tm.channel(p); Mat out0 = top_blob.channel(p); - vfloat16m1_t _bias0 = biasptr ? vle16_v_f16m1(biasptr + p * packn, vl) : vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _bias0 = biasptr ? __riscv_vle16_v_f16m1(biasptr + p * packn, vl) : __riscv_vfmv_v_f_f16m1(0.f, vl); // NOTE variable length array __fp16 tmp[4][6][packn]; @@ -505,27 +505,27 @@ static void conv3x3s1_winograd43_transform_output_packn_fp16sa_rvv(const Mat& to for (int m = 0; m < 6; m++) { - vfloat16m1_t _r00 = vle16_v_f16m1(output0_tm_0, vl); - vfloat16m1_t _r01 = vle16_v_f16m1(output0_tm_1, vl); - vfloat16m1_t _r02 = vle16_v_f16m1(output0_tm_2, vl); - vfloat16m1_t _r03 = vle16_v_f16m1(output0_tm_3, vl); - vfloat16m1_t _r04 = vle16_v_f16m1(output0_tm_4, vl); - vfloat16m1_t _r05 = vle16_v_f16m1(output0_tm_5, vl); - - vfloat16m1_t _tmp02a = vfadd_vv_f16m1(_r01, _r02, vl); - vfloat16m1_t _tmp02b = vfadd_vv_f16m1(_r03, _r04, vl); - vfloat16m1_t _tmp13a = vfsub_vv_f16m1(_r01, _r02, vl); - vfloat16m1_t _tmp13b = vfsub_vv_f16m1(_r03, _r04, vl); - - vfloat16m1_t _tmp0m = vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp02a, vl), _tmp02b, vl); - vfloat16m1_t _tmp1m = vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp13a, sq2_d2, vl), sq2, _tmp13b, vl); - vfloat16m1_t _tmp2m = vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp02a, 0.5f, vl), 2.f, _tmp02b, vl); - vfloat16m1_t _tmp3m = vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r05, sq2_d4, _tmp13a, vl), sq2_m2, _tmp13b, vl); - - vse16_v_f16m1(tmp[0][m], _tmp0m, vl); - vse16_v_f16m1(tmp[1][m], _tmp1m, vl); - vse16_v_f16m1(tmp[2][m], _tmp2m, vl); - vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + vfloat16m1_t _r00 = __riscv_vle16_v_f16m1(output0_tm_0, vl); + vfloat16m1_t _r01 = __riscv_vle16_v_f16m1(output0_tm_1, vl); + vfloat16m1_t _r02 = __riscv_vle16_v_f16m1(output0_tm_2, vl); + vfloat16m1_t _r03 = __riscv_vle16_v_f16m1(output0_tm_3, vl); + vfloat16m1_t _r04 = __riscv_vle16_v_f16m1(output0_tm_4, vl); + vfloat16m1_t _r05 = __riscv_vle16_v_f16m1(output0_tm_5, vl); + + vfloat16m1_t _tmp02a = __riscv_vfadd_vv_f16m1(_r01, _r02, vl); + vfloat16m1_t _tmp02b = __riscv_vfadd_vv_f16m1(_r03, _r04, vl); + vfloat16m1_t _tmp13a = __riscv_vfsub_vv_f16m1(_r01, _r02, vl); + vfloat16m1_t _tmp13b = __riscv_vfsub_vv_f16m1(_r03, 
_r04, vl); + + vfloat16m1_t _tmp0m = __riscv_vfadd_vv_f16m1(__riscv_vfadd_vv_f16m1(_r00, _tmp02a, vl), _tmp02b, vl); + vfloat16m1_t _tmp1m = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_tmp13a, sq2_d2, vl), sq2, _tmp13b, vl); + vfloat16m1_t _tmp2m = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_tmp02a, 0.5f, vl), 2.f, _tmp02b, vl); + vfloat16m1_t _tmp3m = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_r05, sq2_d4, _tmp13a, vl), sq2_m2, _tmp13b, vl); + + __riscv_vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + __riscv_vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + __riscv_vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + __riscv_vse16_v_f16m1(tmp[3][m], _tmp3m, vl); output0_tm_0 += tiles * packn * 6; output0_tm_1 += tiles * packn * 6; @@ -537,27 +537,27 @@ static void conv3x3s1_winograd43_transform_output_packn_fp16sa_rvv(const Mat& to for (int m = 0; m < 4; m++) { - vfloat16m1_t _r00 = vle16_v_f16m1(tmp[m][0], vl); - vfloat16m1_t _r01 = vle16_v_f16m1(tmp[m][1], vl); - vfloat16m1_t _r02 = vle16_v_f16m1(tmp[m][2], vl); - vfloat16m1_t _r03 = vle16_v_f16m1(tmp[m][3], vl); - vfloat16m1_t _r04 = vle16_v_f16m1(tmp[m][4], vl); - vfloat16m1_t _r05 = vle16_v_f16m1(tmp[m][5], vl); - - vfloat16m1_t _tmp02a = vfadd_vv_f16m1(_r01, _r02, vl); - vfloat16m1_t _tmp02b = vfadd_vv_f16m1(_r03, _r04, vl); - vfloat16m1_t _tmp13a = vfsub_vv_f16m1(_r01, _r02, vl); - vfloat16m1_t _tmp13b = vfsub_vv_f16m1(_r03, _r04, vl); - - vfloat16m1_t _out00 = vfadd_vv_f16m1(_bias0, vfadd_vv_f16m1(vfadd_vv_f16m1(_r00, _tmp02a, vl), _tmp02b, vl), vl); - vfloat16m1_t _out01 = vfadd_vv_f16m1(_bias0, vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp13a, sq2_d2, vl), sq2, _tmp13b, vl), vl); - vfloat16m1_t _out02 = vfadd_vv_f16m1(_bias0, vfmacc_vf_f16m1(vfmul_vf_f16m1(_tmp02a, 0.5f, vl), 2.f, _tmp02b, vl), vl); - vfloat16m1_t _out03 = vfadd_vv_f16m1(_bias0, vfmacc_vf_f16m1(vfmacc_vf_f16m1(_r05, sq2_d4, _tmp13a, vl), sq2_m2, _tmp13b, vl), vl); - - vse16_v_f16m1(output0, _out00, vl); - vse16_v_f16m1(output0 + packn, _out01, vl); - vse16_v_f16m1(output0 + packn * 2, _out02, vl); - vse16_v_f16m1(output0 + packn * 3, _out03, vl); + vfloat16m1_t _r00 = __riscv_vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _r01 = __riscv_vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _r02 = __riscv_vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _r03 = __riscv_vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _r04 = __riscv_vle16_v_f16m1(tmp[m][4], vl); + vfloat16m1_t _r05 = __riscv_vle16_v_f16m1(tmp[m][5], vl); + + vfloat16m1_t _tmp02a = __riscv_vfadd_vv_f16m1(_r01, _r02, vl); + vfloat16m1_t _tmp02b = __riscv_vfadd_vv_f16m1(_r03, _r04, vl); + vfloat16m1_t _tmp13a = __riscv_vfsub_vv_f16m1(_r01, _r02, vl); + vfloat16m1_t _tmp13b = __riscv_vfsub_vv_f16m1(_r03, _r04, vl); + + vfloat16m1_t _out00 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfadd_vv_f16m1(__riscv_vfadd_vv_f16m1(_r00, _tmp02a, vl), _tmp02b, vl), vl); + vfloat16m1_t _out01 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_tmp13a, sq2_d2, vl), sq2, _tmp13b, vl), vl); + vfloat16m1_t _out02 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_tmp02a, 0.5f, vl), 2.f, _tmp02b, vl), vl); + vfloat16m1_t _out03 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(_r05, sq2_d4, _tmp13a, vl), sq2_m2, _tmp13b, vl), vl); + + __riscv_vse16_v_f16m1(output0, _out00, vl); + __riscv_vse16_v_f16m1(output0 + packn, _out01, vl); + __riscv_vse16_v_f16m1(output0 + packn * 2, _out02, vl); + __riscv_vse16_v_f16m1(output0 + packn * 3, _out03, vl); output0 += outw * packn; } @@ -569,7 
+569,7 @@ static void conv3x3s1_winograd43_transform_output_packn_fp16sa_rvv(const Mat& to static void conv3x3s1_winograd23_transform_input_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -609,20 +609,20 @@ static void conv3x3s1_winograd23_transform_input_packn_fp16sa_rvv(const Mat& bot for (int m = 0; m < 4; m++) { - vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); - vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn, vl); - vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); - vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r00 = __riscv_vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = __riscv_vle16_v_f16m1(r0 + packn, vl); + vfloat16m1_t _r02 = __riscv_vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = __riscv_vle16_v_f16m1(r0 + packn * 3, vl); - vfloat16m1_t _tmp0m = vfsub_vv_f16m1(_r00, _r02, vl); - vfloat16m1_t _tmp1m = vfadd_vv_f16m1(_r01, _r02, vl); - vfloat16m1_t _tmp2m = vfsub_vv_f16m1(_r02, _r01, vl); - vfloat16m1_t _tmp3m = vfsub_vv_f16m1(_r03, _r01, vl); + vfloat16m1_t _tmp0m = __riscv_vfsub_vv_f16m1(_r00, _r02, vl); + vfloat16m1_t _tmp1m = __riscv_vfadd_vv_f16m1(_r01, _r02, vl); + vfloat16m1_t _tmp2m = __riscv_vfsub_vv_f16m1(_r02, _r01, vl); + vfloat16m1_t _tmp3m = __riscv_vfsub_vv_f16m1(_r03, _r01, vl); - vse16_v_f16m1(tmp[0][m], _tmp0m, vl); - vse16_v_f16m1(tmp[1][m], _tmp1m, vl); - vse16_v_f16m1(tmp[2][m], _tmp2m, vl); - vse16_v_f16m1(tmp[3][m], _tmp3m, vl); + __riscv_vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + __riscv_vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + __riscv_vse16_v_f16m1(tmp[2][m], _tmp2m, vl); + __riscv_vse16_v_f16m1(tmp[3][m], _tmp3m, vl); r0 += w * packn; } @@ -634,20 +634,20 @@ static void conv3x3s1_winograd23_transform_input_packn_fp16sa_rvv(const Mat& bot for (int m = 0; m < 4; m++) { - vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); - vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); - vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); - vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp00 = __riscv_vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = __riscv_vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = __riscv_vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = __riscv_vle16_v_f16m1(tmp[m][3], vl); - vfloat16m1_t _r0tm0 = vfsub_vv_f16m1(_tmp00, _tmp02, vl); - vfloat16m1_t _r0tm1 = vfadd_vv_f16m1(_tmp01, _tmp02, vl); - vfloat16m1_t _r0tm2 = vfsub_vv_f16m1(_tmp02, _tmp01, vl); - vfloat16m1_t _r0tm3 = vfsub_vv_f16m1(_tmp03, _tmp01, vl); + vfloat16m1_t _r0tm0 = __riscv_vfsub_vv_f16m1(_tmp00, _tmp02, vl); + vfloat16m1_t _r0tm1 = __riscv_vfadd_vv_f16m1(_tmp01, _tmp02, vl); + vfloat16m1_t _r0tm2 = __riscv_vfsub_vv_f16m1(_tmp02, _tmp01, vl); + vfloat16m1_t _r0tm3 = __riscv_vfsub_vv_f16m1(_tmp03, _tmp01, vl); - vse16_v_f16m1(r0_tm_0, _r0tm0, vl); - vse16_v_f16m1(r0_tm_1, _r0tm1, vl); - vse16_v_f16m1(r0_tm_2, _r0tm2, vl); - vse16_v_f16m1(r0_tm_3, _r0tm3, vl); + __riscv_vse16_v_f16m1(r0_tm_0, _r0tm0, vl); + __riscv_vse16_v_f16m1(r0_tm_1, _r0tm1, vl); + __riscv_vse16_v_f16m1(r0_tm_2, _r0tm2, vl); + __riscv_vse16_v_f16m1(r0_tm_3, _r0tm3, vl); r0_tm_0 += tiles * packn * 4; r0_tm_1 += tiles * packn * 4; @@ -662,7 +662,7 @@ static void conv3x3s1_winograd23_transform_input_packn_fp16sa_rvv(const Mat& bot static void conv3x3s1_winograd23_transform_output_packn_fp16sa_rvv(const Mat& top_blob_tm, Mat& 
top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -688,7 +688,7 @@ static void conv3x3s1_winograd23_transform_output_packn_fp16sa_rvv(const Mat& to const Mat out0_tm = top_blob_tm.channel(p); Mat out0 = top_blob.channel(p); - vfloat16m1_t _bias0 = biasptr ? vle16_v_f16m1(biasptr + p * packn, vl) : vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _bias0 = biasptr ? __riscv_vle16_v_f16m1(biasptr + p * packn, vl) : __riscv_vfmv_v_f_f16m1(0.f, vl); // NOTE variable length array __fp16 tmp[2][4][packn]; @@ -707,16 +707,16 @@ static void conv3x3s1_winograd23_transform_output_packn_fp16sa_rvv(const Mat& to for (int m = 0; m < 4; m++) { - vfloat16m1_t _out0tm0 = vle16_v_f16m1(output0_tm_0, vl); - vfloat16m1_t _out0tm1 = vle16_v_f16m1(output0_tm_1, vl); - vfloat16m1_t _out0tm2 = vle16_v_f16m1(output0_tm_2, vl); - vfloat16m1_t _out0tm3 = vle16_v_f16m1(output0_tm_3, vl); + vfloat16m1_t _out0tm0 = __riscv_vle16_v_f16m1(output0_tm_0, vl); + vfloat16m1_t _out0tm1 = __riscv_vle16_v_f16m1(output0_tm_1, vl); + vfloat16m1_t _out0tm2 = __riscv_vle16_v_f16m1(output0_tm_2, vl); + vfloat16m1_t _out0tm3 = __riscv_vle16_v_f16m1(output0_tm_3, vl); - vfloat16m1_t _tmp0m = vfadd_vv_f16m1(vfadd_vv_f16m1(_out0tm0, _out0tm1, vl), _out0tm2, vl); - vfloat16m1_t _tmp1m = vfadd_vv_f16m1(vfsub_vv_f16m1(_out0tm1, _out0tm2, vl), _out0tm3, vl); + vfloat16m1_t _tmp0m = __riscv_vfadd_vv_f16m1(__riscv_vfadd_vv_f16m1(_out0tm0, _out0tm1, vl), _out0tm2, vl); + vfloat16m1_t _tmp1m = __riscv_vfadd_vv_f16m1(__riscv_vfsub_vv_f16m1(_out0tm1, _out0tm2, vl), _out0tm3, vl); - vse16_v_f16m1(tmp[0][m], _tmp0m, vl); - vse16_v_f16m1(tmp[1][m], _tmp1m, vl); + __riscv_vse16_v_f16m1(tmp[0][m], _tmp0m, vl); + __riscv_vse16_v_f16m1(tmp[1][m], _tmp1m, vl); output0_tm_0 += tiles * packn * 4; output0_tm_1 += tiles * packn * 4; @@ -726,16 +726,16 @@ static void conv3x3s1_winograd23_transform_output_packn_fp16sa_rvv(const Mat& to for (int m = 0; m < 2; m++) { - vfloat16m1_t _tmp00 = vle16_v_f16m1(tmp[m][0], vl); - vfloat16m1_t _tmp01 = vle16_v_f16m1(tmp[m][1], vl); - vfloat16m1_t _tmp02 = vle16_v_f16m1(tmp[m][2], vl); - vfloat16m1_t _tmp03 = vle16_v_f16m1(tmp[m][3], vl); + vfloat16m1_t _tmp00 = __riscv_vle16_v_f16m1(tmp[m][0], vl); + vfloat16m1_t _tmp01 = __riscv_vle16_v_f16m1(tmp[m][1], vl); + vfloat16m1_t _tmp02 = __riscv_vle16_v_f16m1(tmp[m][2], vl); + vfloat16m1_t _tmp03 = __riscv_vle16_v_f16m1(tmp[m][3], vl); - vfloat16m1_t _out00 = vfadd_vv_f16m1(_bias0, vfadd_vv_f16m1(vfadd_vv_f16m1(_tmp00, _tmp01, vl), _tmp02, vl), vl); - vfloat16m1_t _out01 = vfadd_vv_f16m1(_bias0, vfadd_vv_f16m1(vfsub_vv_f16m1(_tmp01, _tmp02, vl), _tmp03, vl), vl); + vfloat16m1_t _out00 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfadd_vv_f16m1(__riscv_vfadd_vv_f16m1(_tmp00, _tmp01, vl), _tmp02, vl), vl); + vfloat16m1_t _out01 = __riscv_vfadd_vv_f16m1(_bias0, __riscv_vfadd_vv_f16m1(__riscv_vfsub_vv_f16m1(_tmp01, _tmp02, vl), _tmp03, vl), vl); - vse16_v_f16m1(output0, _out00, vl); - vse16_v_f16m1(output0 + packn, _out01, vl); + __riscv_vse16_v_f16m1(output0, _out00, vl); + __riscv_vse16_v_f16m1(output0 + packn, _out01, vl); output0 += outw * packn; } diff --git a/src/layer/riscv/convolutiondepthwise_3x3_packn.h b/src/layer/riscv/convolutiondepthwise_3x3_packn.h index 0cab1af0802..a2f27353715 100644 --- a/src/layer/riscv/convolutiondepthwise_3x3_packn.h +++ b/src/layer/riscv/convolutiondepthwise_3x3_packn.h @@ 
-15,7 +15,7 @@ static void convdw3x3s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int w = bottom_blob.w; @@ -31,7 +31,7 @@ static void convdw3x3s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M { Mat out = top_blob.channel(g); - vfloat32m1_t _bias0 = bias ? vle32_v_f32m1(bias + g * packn, vl) : vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _bias0 = bias ? __riscv_vle32_v_f32m1(bias + g * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); const float* k0 = kernel.row(g); @@ -45,15 +45,15 @@ static void convdw3x3s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M const float* r2 = img0.row(2); const float* r3 = img0.row(3); - vfloat32m1_t _k00 = vle32_v_f32m1(k0, vl); - vfloat32m1_t _k01 = vle32_v_f32m1(k0 + packn, vl); - vfloat32m1_t _k02 = vle32_v_f32m1(k0 + packn * 2, vl); - vfloat32m1_t _k10 = vle32_v_f32m1(k0 + packn * 3, vl); - vfloat32m1_t _k11 = vle32_v_f32m1(k0 + packn * 4, vl); - vfloat32m1_t _k12 = vle32_v_f32m1(k0 + packn * 5, vl); - vfloat32m1_t _k20 = vle32_v_f32m1(k0 + packn * 6, vl); - vfloat32m1_t _k21 = vle32_v_f32m1(k0 + packn * 7, vl); - vfloat32m1_t _k22 = vle32_v_f32m1(k0 + packn * 8, vl); + vfloat32m1_t _k00 = __riscv_vle32_v_f32m1(k0, vl); + vfloat32m1_t _k01 = __riscv_vle32_v_f32m1(k0 + packn, vl); + vfloat32m1_t _k02 = __riscv_vle32_v_f32m1(k0 + packn * 2, vl); + vfloat32m1_t _k10 = __riscv_vle32_v_f32m1(k0 + packn * 3, vl); + vfloat32m1_t _k11 = __riscv_vle32_v_f32m1(k0 + packn * 4, vl); + vfloat32m1_t _k12 = __riscv_vle32_v_f32m1(k0 + packn * 5, vl); + vfloat32m1_t _k20 = __riscv_vle32_v_f32m1(k0 + packn * 6, vl); + vfloat32m1_t _k21 = __riscv_vle32_v_f32m1(k0 + packn * 7, vl); + vfloat32m1_t _k22 = __riscv_vle32_v_f32m1(k0 + packn * 8, vl); int i = 0; for (; i + 1 < outh; i += 2) @@ -66,70 +66,70 @@ static void convdw3x3s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M vfloat32m1_t _sum10 = _bias0; vfloat32m1_t _sum11 = _bias0; - vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); - vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn, vl); - vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); - vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); - - _sum00 = vfmacc_vv_f32m1(_sum00, _k00, _r00, vl); - _sum00 = vfmacc_vv_f32m1(_sum00, _k01, _r01, vl); - _sum00 = vfmacc_vv_f32m1(_sum00, _k02, _r02, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k00, _r01, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k01, _r02, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k02, _r03, vl); - - vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); - vfloat32m1_t _r11 = vle32_v_f32m1(r1 + packn, vl); - vfloat32m1_t _r12 = vle32_v_f32m1(r1 + packn * 2, vl); - vfloat32m1_t _r13 = vle32_v_f32m1(r1 + packn * 3, vl); - - _sum00 = vfmacc_vv_f32m1(_sum00, _k10, _r10, vl); - _sum00 = vfmacc_vv_f32m1(_sum00, _k11, _r11, vl); - _sum00 = vfmacc_vv_f32m1(_sum00, _k12, _r12, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k10, _r11, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k11, _r12, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k12, _r13, vl); - _sum10 = vfmacc_vv_f32m1(_sum10, _k00, _r10, vl); - _sum10 = vfmacc_vv_f32m1(_sum10, _k01, _r11, vl); - _sum10 = vfmacc_vv_f32m1(_sum10, _k02, _r12, vl); - _sum11 = vfmacc_vv_f32m1(_sum11, _k00, _r11, vl); - _sum11 = vfmacc_vv_f32m1(_sum11, _k01, _r12, vl); - _sum11 = vfmacc_vv_f32m1(_sum11, _k02, _r13, vl); - - vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); - vfloat32m1_t _r21 = vle32_v_f32m1(r2 + packn, 
vl); - vfloat32m1_t _r22 = vle32_v_f32m1(r2 + packn * 2, vl); - vfloat32m1_t _r23 = vle32_v_f32m1(r2 + packn * 3, vl); - - _sum00 = vfmacc_vv_f32m1(_sum00, _k20, _r20, vl); - _sum00 = vfmacc_vv_f32m1(_sum00, _k21, _r21, vl); - _sum00 = vfmacc_vv_f32m1(_sum00, _k22, _r22, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k20, _r21, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k21, _r22, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k22, _r23, vl); - _sum10 = vfmacc_vv_f32m1(_sum10, _k10, _r20, vl); - _sum10 = vfmacc_vv_f32m1(_sum10, _k11, _r21, vl); - _sum10 = vfmacc_vv_f32m1(_sum10, _k12, _r22, vl); - _sum11 = vfmacc_vv_f32m1(_sum11, _k10, _r21, vl); - _sum11 = vfmacc_vv_f32m1(_sum11, _k11, _r22, vl); - _sum11 = vfmacc_vv_f32m1(_sum11, _k12, _r23, vl); - - vfloat32m1_t _r30 = vle32_v_f32m1(r3, vl); - vfloat32m1_t _r31 = vle32_v_f32m1(r3 + packn, vl); - vfloat32m1_t _r32 = vle32_v_f32m1(r3 + packn * 2, vl); - vfloat32m1_t _r33 = vle32_v_f32m1(r3 + packn * 3, vl); - - _sum10 = vfmacc_vv_f32m1(_sum10, _k20, _r30, vl); - _sum10 = vfmacc_vv_f32m1(_sum10, _k21, _r31, vl); - _sum10 = vfmacc_vv_f32m1(_sum10, _k22, _r32, vl); - _sum11 = vfmacc_vv_f32m1(_sum11, _k20, _r31, vl); - _sum11 = vfmacc_vv_f32m1(_sum11, _k21, _r32, vl); - _sum11 = vfmacc_vv_f32m1(_sum11, _k22, _r33, vl); - - vse32_v_f32m1(outptr0, _sum00, vl); - vse32_v_f32m1(outptr0 + packn, _sum01, vl); - vse32_v_f32m1(outptr1, _sum10, vl); - vse32_v_f32m1(outptr1 + packn, _sum11, vl); + vfloat32m1_t _r00 = __riscv_vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = __riscv_vle32_v_f32m1(r0 + packn, vl); + vfloat32m1_t _r02 = __riscv_vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = __riscv_vle32_v_f32m1(r0 + packn * 3, vl); + + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k00, _r00, vl); + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k01, _r01, vl); + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k02, _r02, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k00, _r01, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k01, _r02, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k02, _r03, vl); + + vfloat32m1_t _r10 = __riscv_vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = __riscv_vle32_v_f32m1(r1 + packn, vl); + vfloat32m1_t _r12 = __riscv_vle32_v_f32m1(r1 + packn * 2, vl); + vfloat32m1_t _r13 = __riscv_vle32_v_f32m1(r1 + packn * 3, vl); + + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k10, _r10, vl); + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k11, _r11, vl); + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k12, _r12, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k10, _r11, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k11, _r12, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k12, _r13, vl); + _sum10 = __riscv_vfmacc_vv_f32m1(_sum10, _k00, _r10, vl); + _sum10 = __riscv_vfmacc_vv_f32m1(_sum10, _k01, _r11, vl); + _sum10 = __riscv_vfmacc_vv_f32m1(_sum10, _k02, _r12, vl); + _sum11 = __riscv_vfmacc_vv_f32m1(_sum11, _k00, _r11, vl); + _sum11 = __riscv_vfmacc_vv_f32m1(_sum11, _k01, _r12, vl); + _sum11 = __riscv_vfmacc_vv_f32m1(_sum11, _k02, _r13, vl); + + vfloat32m1_t _r20 = __riscv_vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = __riscv_vle32_v_f32m1(r2 + packn, vl); + vfloat32m1_t _r22 = __riscv_vle32_v_f32m1(r2 + packn * 2, vl); + vfloat32m1_t _r23 = __riscv_vle32_v_f32m1(r2 + packn * 3, vl); + + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k20, _r20, vl); + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k21, _r21, vl); + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k22, _r22, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k20, _r21, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k21, 
_r22, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k22, _r23, vl); + _sum10 = __riscv_vfmacc_vv_f32m1(_sum10, _k10, _r20, vl); + _sum10 = __riscv_vfmacc_vv_f32m1(_sum10, _k11, _r21, vl); + _sum10 = __riscv_vfmacc_vv_f32m1(_sum10, _k12, _r22, vl); + _sum11 = __riscv_vfmacc_vv_f32m1(_sum11, _k10, _r21, vl); + _sum11 = __riscv_vfmacc_vv_f32m1(_sum11, _k11, _r22, vl); + _sum11 = __riscv_vfmacc_vv_f32m1(_sum11, _k12, _r23, vl); + + vfloat32m1_t _r30 = __riscv_vle32_v_f32m1(r3, vl); + vfloat32m1_t _r31 = __riscv_vle32_v_f32m1(r3 + packn, vl); + vfloat32m1_t _r32 = __riscv_vle32_v_f32m1(r3 + packn * 2, vl); + vfloat32m1_t _r33 = __riscv_vle32_v_f32m1(r3 + packn * 3, vl); + + _sum10 = __riscv_vfmacc_vv_f32m1(_sum10, _k20, _r30, vl); + _sum10 = __riscv_vfmacc_vv_f32m1(_sum10, _k21, _r31, vl); + _sum10 = __riscv_vfmacc_vv_f32m1(_sum10, _k22, _r32, vl); + _sum11 = __riscv_vfmacc_vv_f32m1(_sum11, _k20, _r31, vl); + _sum11 = __riscv_vfmacc_vv_f32m1(_sum11, _k21, _r32, vl); + _sum11 = __riscv_vfmacc_vv_f32m1(_sum11, _k22, _r33, vl); + + __riscv_vse32_v_f32m1(outptr0, _sum00, vl); + __riscv_vse32_v_f32m1(outptr0 + packn, _sum01, vl); + __riscv_vse32_v_f32m1(outptr1, _sum10, vl); + __riscv_vse32_v_f32m1(outptr1 + packn, _sum11, vl); outptr0 += packn * 2; outptr1 += packn * 2; @@ -144,46 +144,46 @@ static void convdw3x3s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M vfloat32m1_t _sum0 = _bias0; vfloat32m1_t _sum1 = _bias0; - vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); - vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn, vl); - vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r00 = __riscv_vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = __riscv_vle32_v_f32m1(r0 + packn, vl); + vfloat32m1_t _r02 = __riscv_vle32_v_f32m1(r0 + packn * 2, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k00, _r00, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k01, _r01, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k02, _r02, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k00, _r00, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k01, _r01, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k02, _r02, vl); - vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); - vfloat32m1_t _r11 = vle32_v_f32m1(r1 + packn, vl); - vfloat32m1_t _r12 = vle32_v_f32m1(r1 + packn * 2, vl); + vfloat32m1_t _r10 = __riscv_vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = __riscv_vle32_v_f32m1(r1 + packn, vl); + vfloat32m1_t _r12 = __riscv_vle32_v_f32m1(r1 + packn * 2, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k10, _r10, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k11, _r11, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k12, _r12, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k00, _r10, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k01, _r11, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k02, _r12, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k10, _r10, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k11, _r11, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k12, _r12, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k00, _r10, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k01, _r11, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k02, _r12, vl); - vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); - vfloat32m1_t _r21 = vle32_v_f32m1(r2 + packn, vl); - vfloat32m1_t _r22 = vle32_v_f32m1(r2 + packn * 2, vl); + vfloat32m1_t _r20 = __riscv_vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = __riscv_vle32_v_f32m1(r2 + packn, vl); + vfloat32m1_t _r22 = __riscv_vle32_v_f32m1(r2 + packn * 2, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k20, _r20, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k21, _r21, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, 
_k22, _r22, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k10, _r20, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k11, _r21, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k12, _r22, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k20, _r20, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k21, _r21, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k22, _r22, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k10, _r20, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k11, _r21, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k12, _r22, vl); - vfloat32m1_t _r30 = vle32_v_f32m1(r3, vl); - vfloat32m1_t _r31 = vle32_v_f32m1(r3 + packn, vl); - vfloat32m1_t _r32 = vle32_v_f32m1(r3 + packn * 2, vl); + vfloat32m1_t _r30 = __riscv_vle32_v_f32m1(r3, vl); + vfloat32m1_t _r31 = __riscv_vle32_v_f32m1(r3 + packn, vl); + vfloat32m1_t _r32 = __riscv_vle32_v_f32m1(r3 + packn * 2, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k20, _r30, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k21, _r31, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k22, _r32, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k20, _r30, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k21, _r31, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k22, _r32, vl); - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr1, _sum1, vl); + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr1, _sum1, vl); outptr0 += packn; outptr1 += packn; @@ -210,44 +210,44 @@ static void convdw3x3s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M vfloat32m1_t _sum00 = _bias0; vfloat32m1_t _sum01 = _bias0; - vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); - vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn, vl); - vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); - vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); - - _sum00 = vfmacc_vv_f32m1(_sum00, _k00, _r00, vl); - _sum00 = vfmacc_vv_f32m1(_sum00, _k01, _r01, vl); - _sum00 = vfmacc_vv_f32m1(_sum00, _k02, _r02, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k00, _r01, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k01, _r02, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k02, _r03, vl); - - vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); - vfloat32m1_t _r11 = vle32_v_f32m1(r1 + packn, vl); - vfloat32m1_t _r12 = vle32_v_f32m1(r1 + packn * 2, vl); - vfloat32m1_t _r13 = vle32_v_f32m1(r1 + packn * 3, vl); - - _sum00 = vfmacc_vv_f32m1(_sum00, _k10, _r10, vl); - _sum00 = vfmacc_vv_f32m1(_sum00, _k11, _r11, vl); - _sum00 = vfmacc_vv_f32m1(_sum00, _k12, _r12, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k10, _r11, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k11, _r12, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k12, _r13, vl); - - vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); - vfloat32m1_t _r21 = vle32_v_f32m1(r2 + packn, vl); - vfloat32m1_t _r22 = vle32_v_f32m1(r2 + packn * 2, vl); - vfloat32m1_t _r23 = vle32_v_f32m1(r2 + packn * 3, vl); - - _sum00 = vfmacc_vv_f32m1(_sum00, _k20, _r20, vl); - _sum00 = vfmacc_vv_f32m1(_sum00, _k21, _r21, vl); - _sum00 = vfmacc_vv_f32m1(_sum00, _k22, _r22, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k20, _r21, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k21, _r22, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k22, _r23, vl); - - vse32_v_f32m1(outptr0, _sum00, vl); - vse32_v_f32m1(outptr0 + packn, _sum01, vl); + vfloat32m1_t _r00 = __riscv_vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = __riscv_vle32_v_f32m1(r0 + packn, vl); + vfloat32m1_t _r02 = __riscv_vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = __riscv_vle32_v_f32m1(r0 + packn * 3, vl); + + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k00, _r00, vl); + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, 
_k01, _r01, vl); + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k02, _r02, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k00, _r01, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k01, _r02, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k02, _r03, vl); + + vfloat32m1_t _r10 = __riscv_vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = __riscv_vle32_v_f32m1(r1 + packn, vl); + vfloat32m1_t _r12 = __riscv_vle32_v_f32m1(r1 + packn * 2, vl); + vfloat32m1_t _r13 = __riscv_vle32_v_f32m1(r1 + packn * 3, vl); + + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k10, _r10, vl); + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k11, _r11, vl); + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k12, _r12, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k10, _r11, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k11, _r12, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k12, _r13, vl); + + vfloat32m1_t _r20 = __riscv_vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = __riscv_vle32_v_f32m1(r2 + packn, vl); + vfloat32m1_t _r22 = __riscv_vle32_v_f32m1(r2 + packn * 2, vl); + vfloat32m1_t _r23 = __riscv_vle32_v_f32m1(r2 + packn * 3, vl); + + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k20, _r20, vl); + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k21, _r21, vl); + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k22, _r22, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k20, _r21, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k21, _r22, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k22, _r23, vl); + + __riscv_vse32_v_f32m1(outptr0, _sum00, vl); + __riscv_vse32_v_f32m1(outptr0 + packn, _sum01, vl); outptr0 += packn * 2; @@ -259,31 +259,31 @@ static void convdw3x3s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M { vfloat32m1_t _sum0 = _bias0; - vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); - vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn, vl); - vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r00 = __riscv_vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = __riscv_vle32_v_f32m1(r0 + packn, vl); + vfloat32m1_t _r02 = __riscv_vle32_v_f32m1(r0 + packn * 2, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k00, _r00, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k01, _r01, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k02, _r02, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k00, _r00, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k01, _r01, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k02, _r02, vl); - vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); - vfloat32m1_t _r11 = vle32_v_f32m1(r1 + packn, vl); - vfloat32m1_t _r12 = vle32_v_f32m1(r1 + packn * 2, vl); + vfloat32m1_t _r10 = __riscv_vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = __riscv_vle32_v_f32m1(r1 + packn, vl); + vfloat32m1_t _r12 = __riscv_vle32_v_f32m1(r1 + packn * 2, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k10, _r10, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k11, _r11, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k12, _r12, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k10, _r10, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k11, _r11, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k12, _r12, vl); - vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); - vfloat32m1_t _r21 = vle32_v_f32m1(r2 + packn, vl); - vfloat32m1_t _r22 = vle32_v_f32m1(r2 + packn * 2, vl); + vfloat32m1_t _r20 = __riscv_vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = __riscv_vle32_v_f32m1(r2 + packn, vl); + vfloat32m1_t _r22 = __riscv_vle32_v_f32m1(r2 + packn * 2, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k20, _r20, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k21, _r21, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k22, _r22, vl); + _sum0 = 
__riscv_vfmacc_vv_f32m1(_sum0, _k20, _r20, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k21, _r21, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k22, _r22, vl); - vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); outptr0 += packn; @@ -302,7 +302,7 @@ static void convdw3x3s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M static void convdw3x3s2_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int w = bottom_blob.w; @@ -320,7 +320,7 @@ static void convdw3x3s2_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M { Mat out = top_blob.channel(g); - vfloat32m1_t _bias0 = bias ? vle32_v_f32m1(bias + g * packn, vl) : vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _bias0 = bias ? __riscv_vle32_v_f32m1(bias + g * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); const float* k0 = kernel.row(g); @@ -332,15 +332,15 @@ static void convdw3x3s2_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M const float* r1 = img0.row(1); const float* r2 = img0.row(2); - vfloat32m1_t _k00 = vle32_v_f32m1(k0, vl); - vfloat32m1_t _k01 = vle32_v_f32m1(k0 + packn, vl); - vfloat32m1_t _k02 = vle32_v_f32m1(k0 + packn * 2, vl); - vfloat32m1_t _k10 = vle32_v_f32m1(k0 + packn * 3, vl); - vfloat32m1_t _k11 = vle32_v_f32m1(k0 + packn * 4, vl); - vfloat32m1_t _k12 = vle32_v_f32m1(k0 + packn * 5, vl); - vfloat32m1_t _k20 = vle32_v_f32m1(k0 + packn * 6, vl); - vfloat32m1_t _k21 = vle32_v_f32m1(k0 + packn * 7, vl); - vfloat32m1_t _k22 = vle32_v_f32m1(k0 + packn * 8, vl); + vfloat32m1_t _k00 = __riscv_vle32_v_f32m1(k0, vl); + vfloat32m1_t _k01 = __riscv_vle32_v_f32m1(k0 + packn, vl); + vfloat32m1_t _k02 = __riscv_vle32_v_f32m1(k0 + packn * 2, vl); + vfloat32m1_t _k10 = __riscv_vle32_v_f32m1(k0 + packn * 3, vl); + vfloat32m1_t _k11 = __riscv_vle32_v_f32m1(k0 + packn * 4, vl); + vfloat32m1_t _k12 = __riscv_vle32_v_f32m1(k0 + packn * 5, vl); + vfloat32m1_t _k20 = __riscv_vle32_v_f32m1(k0 + packn * 6, vl); + vfloat32m1_t _k21 = __riscv_vle32_v_f32m1(k0 + packn * 7, vl); + vfloat32m1_t _k22 = __riscv_vle32_v_f32m1(k0 + packn * 8, vl); int i = 0; for (; i < outh; i++) @@ -351,47 +351,47 @@ static void convdw3x3s2_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M vfloat32m1_t _sum00 = _bias0; vfloat32m1_t _sum01 = _bias0; - vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); - vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn, vl); - vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); - vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); - vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); - - _sum00 = vfmacc_vv_f32m1(_sum00, _k00, _r00, vl); - _sum00 = vfmacc_vv_f32m1(_sum00, _k01, _r01, vl); - _sum00 = vfmacc_vv_f32m1(_sum00, _k02, _r02, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k00, _r02, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k01, _r03, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k02, _r04, vl); - - vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); - vfloat32m1_t _r11 = vle32_v_f32m1(r1 + packn, vl); - vfloat32m1_t _r12 = vle32_v_f32m1(r1 + packn * 2, vl); - vfloat32m1_t _r13 = vle32_v_f32m1(r1 + packn * 3, vl); - vfloat32m1_t _r14 = vle32_v_f32m1(r1 + packn * 4, vl); - - _sum00 = vfmacc_vv_f32m1(_sum00, _k10, _r10, vl); - _sum00 = vfmacc_vv_f32m1(_sum00, _k11, _r11, vl); - _sum00 = vfmacc_vv_f32m1(_sum00, _k12, _r12, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k10, _r12, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k11, _r13, 
vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k12, _r14, vl); - - vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); - vfloat32m1_t _r21 = vle32_v_f32m1(r2 + packn, vl); - vfloat32m1_t _r22 = vle32_v_f32m1(r2 + packn * 2, vl); - vfloat32m1_t _r23 = vle32_v_f32m1(r2 + packn * 3, vl); - vfloat32m1_t _r24 = vle32_v_f32m1(r2 + packn * 4, vl); - - _sum00 = vfmacc_vv_f32m1(_sum00, _k20, _r20, vl); - _sum00 = vfmacc_vv_f32m1(_sum00, _k21, _r21, vl); - _sum00 = vfmacc_vv_f32m1(_sum00, _k22, _r22, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k20, _r22, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k21, _r23, vl); - _sum01 = vfmacc_vv_f32m1(_sum01, _k22, _r24, vl); - - vse32_v_f32m1(outptr0, _sum00, vl); - vse32_v_f32m1(outptr0 + packn, _sum01, vl); + vfloat32m1_t _r00 = __riscv_vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = __riscv_vle32_v_f32m1(r0 + packn, vl); + vfloat32m1_t _r02 = __riscv_vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = __riscv_vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = __riscv_vle32_v_f32m1(r0 + packn * 4, vl); + + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k00, _r00, vl); + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k01, _r01, vl); + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k02, _r02, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k00, _r02, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k01, _r03, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k02, _r04, vl); + + vfloat32m1_t _r10 = __riscv_vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = __riscv_vle32_v_f32m1(r1 + packn, vl); + vfloat32m1_t _r12 = __riscv_vle32_v_f32m1(r1 + packn * 2, vl); + vfloat32m1_t _r13 = __riscv_vle32_v_f32m1(r1 + packn * 3, vl); + vfloat32m1_t _r14 = __riscv_vle32_v_f32m1(r1 + packn * 4, vl); + + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k10, _r10, vl); + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k11, _r11, vl); + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k12, _r12, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k10, _r12, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k11, _r13, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k12, _r14, vl); + + vfloat32m1_t _r20 = __riscv_vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = __riscv_vle32_v_f32m1(r2 + packn, vl); + vfloat32m1_t _r22 = __riscv_vle32_v_f32m1(r2 + packn * 2, vl); + vfloat32m1_t _r23 = __riscv_vle32_v_f32m1(r2 + packn * 3, vl); + vfloat32m1_t _r24 = __riscv_vle32_v_f32m1(r2 + packn * 4, vl); + + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k20, _r20, vl); + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k21, _r21, vl); + _sum00 = __riscv_vfmacc_vv_f32m1(_sum00, _k22, _r22, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k20, _r22, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k21, _r23, vl); + _sum01 = __riscv_vfmacc_vv_f32m1(_sum01, _k22, _r24, vl); + + __riscv_vse32_v_f32m1(outptr0, _sum00, vl); + __riscv_vse32_v_f32m1(outptr0 + packn, _sum01, vl); outptr0 += packn * 2; @@ -403,31 +403,31 @@ static void convdw3x3s2_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M { vfloat32m1_t _sum0 = _bias0; - vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); - vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn, vl); - vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r00 = __riscv_vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = __riscv_vle32_v_f32m1(r0 + packn, vl); + vfloat32m1_t _r02 = __riscv_vle32_v_f32m1(r0 + packn * 2, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k00, _r00, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k01, _r01, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k02, _r02, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k00, _r00, vl); + _sum0 = 
__riscv_vfmacc_vv_f32m1(_sum0, _k01, _r01, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k02, _r02, vl); - vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); - vfloat32m1_t _r11 = vle32_v_f32m1(r1 + packn, vl); - vfloat32m1_t _r12 = vle32_v_f32m1(r1 + packn * 2, vl); + vfloat32m1_t _r10 = __riscv_vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = __riscv_vle32_v_f32m1(r1 + packn, vl); + vfloat32m1_t _r12 = __riscv_vle32_v_f32m1(r1 + packn * 2, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k10, _r10, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k11, _r11, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k12, _r12, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k10, _r10, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k11, _r11, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k12, _r12, vl); - vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); - vfloat32m1_t _r21 = vle32_v_f32m1(r2 + packn, vl); - vfloat32m1_t _r22 = vle32_v_f32m1(r2 + packn * 2, vl); + vfloat32m1_t _r20 = __riscv_vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = __riscv_vle32_v_f32m1(r2 + packn, vl); + vfloat32m1_t _r22 = __riscv_vle32_v_f32m1(r2 + packn * 2, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k20, _r20, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k21, _r21, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k22, _r22, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k20, _r20, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k21, _r21, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k22, _r22, vl); - vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); outptr0 += packn; diff --git a/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h b/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h index d479385f6a2..6493c50c040 100644 --- a/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h +++ b/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h @@ -15,7 +15,7 @@ static void convdw3x3s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; @@ -31,7 +31,7 @@ static void convdw3x3s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, { Mat out = top_blob.channel(g); - vfloat16m1_t _bias0 = bias ? vle16_v_f16m1(bias + g * packn, vl) : vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _bias0 = bias ? 
__riscv_vle16_v_f16m1(bias + g * packn, vl) : __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); const __fp16* k0 = kernel.row(g); @@ -45,15 +45,15 @@ static void convdw3x3s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const __fp16* r2 = img0.row(2); const __fp16* r3 = img0.row(3); - vfloat16m1_t _k00 = vle16_v_f16m1(k0, vl); - vfloat16m1_t _k01 = vle16_v_f16m1(k0 + packn, vl); - vfloat16m1_t _k02 = vle16_v_f16m1(k0 + packn * 2, vl); - vfloat16m1_t _k10 = vle16_v_f16m1(k0 + packn * 3, vl); - vfloat16m1_t _k11 = vle16_v_f16m1(k0 + packn * 4, vl); - vfloat16m1_t _k12 = vle16_v_f16m1(k0 + packn * 5, vl); - vfloat16m1_t _k20 = vle16_v_f16m1(k0 + packn * 6, vl); - vfloat16m1_t _k21 = vle16_v_f16m1(k0 + packn * 7, vl); - vfloat16m1_t _k22 = vle16_v_f16m1(k0 + packn * 8, vl); + vfloat16m1_t _k00 = __riscv_vle16_v_f16m1(k0, vl); + vfloat16m1_t _k01 = __riscv_vle16_v_f16m1(k0 + packn, vl); + vfloat16m1_t _k02 = __riscv_vle16_v_f16m1(k0 + packn * 2, vl); + vfloat16m1_t _k10 = __riscv_vle16_v_f16m1(k0 + packn * 3, vl); + vfloat16m1_t _k11 = __riscv_vle16_v_f16m1(k0 + packn * 4, vl); + vfloat16m1_t _k12 = __riscv_vle16_v_f16m1(k0 + packn * 5, vl); + vfloat16m1_t _k20 = __riscv_vle16_v_f16m1(k0 + packn * 6, vl); + vfloat16m1_t _k21 = __riscv_vle16_v_f16m1(k0 + packn * 7, vl); + vfloat16m1_t _k22 = __riscv_vle16_v_f16m1(k0 + packn * 8, vl); int i = 0; for (; i + 1 < outh; i += 2) @@ -66,70 +66,70 @@ static void convdw3x3s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, vfloat16m1_t _sum10 = _bias0; vfloat16m1_t _sum11 = _bias0; - vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); - vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn, vl); - vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); - vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); - - _sum00 = vfmacc_vv_f16m1(_sum00, _k00, _r00, vl); - _sum00 = vfmacc_vv_f16m1(_sum00, _k01, _r01, vl); - _sum00 = vfmacc_vv_f16m1(_sum00, _k02, _r02, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k00, _r01, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k01, _r02, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k02, _r03, vl); - - vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); - vfloat16m1_t _r11 = vle16_v_f16m1(r1 + packn, vl); - vfloat16m1_t _r12 = vle16_v_f16m1(r1 + packn * 2, vl); - vfloat16m1_t _r13 = vle16_v_f16m1(r1 + packn * 3, vl); - - _sum00 = vfmacc_vv_f16m1(_sum00, _k10, _r10, vl); - _sum00 = vfmacc_vv_f16m1(_sum00, _k11, _r11, vl); - _sum00 = vfmacc_vv_f16m1(_sum00, _k12, _r12, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k10, _r11, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k11, _r12, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k12, _r13, vl); - _sum10 = vfmacc_vv_f16m1(_sum10, _k00, _r10, vl); - _sum10 = vfmacc_vv_f16m1(_sum10, _k01, _r11, vl); - _sum10 = vfmacc_vv_f16m1(_sum10, _k02, _r12, vl); - _sum11 = vfmacc_vv_f16m1(_sum11, _k00, _r11, vl); - _sum11 = vfmacc_vv_f16m1(_sum11, _k01, _r12, vl); - _sum11 = vfmacc_vv_f16m1(_sum11, _k02, _r13, vl); - - vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); - vfloat16m1_t _r21 = vle16_v_f16m1(r2 + packn, vl); - vfloat16m1_t _r22 = vle16_v_f16m1(r2 + packn * 2, vl); - vfloat16m1_t _r23 = vle16_v_f16m1(r2 + packn * 3, vl); - - _sum00 = vfmacc_vv_f16m1(_sum00, _k20, _r20, vl); - _sum00 = vfmacc_vv_f16m1(_sum00, _k21, _r21, vl); - _sum00 = vfmacc_vv_f16m1(_sum00, _k22, _r22, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k20, _r21, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k21, _r22, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k22, _r23, vl); - _sum10 = vfmacc_vv_f16m1(_sum10, _k10, _r20, vl); - _sum10 = vfmacc_vv_f16m1(_sum10, _k11, _r21, vl); - 
_sum10 = vfmacc_vv_f16m1(_sum10, _k12, _r22, vl); - _sum11 = vfmacc_vv_f16m1(_sum11, _k10, _r21, vl); - _sum11 = vfmacc_vv_f16m1(_sum11, _k11, _r22, vl); - _sum11 = vfmacc_vv_f16m1(_sum11, _k12, _r23, vl); - - vfloat16m1_t _r30 = vle16_v_f16m1(r3, vl); - vfloat16m1_t _r31 = vle16_v_f16m1(r3 + packn, vl); - vfloat16m1_t _r32 = vle16_v_f16m1(r3 + packn * 2, vl); - vfloat16m1_t _r33 = vle16_v_f16m1(r3 + packn * 3, vl); - - _sum10 = vfmacc_vv_f16m1(_sum10, _k20, _r30, vl); - _sum10 = vfmacc_vv_f16m1(_sum10, _k21, _r31, vl); - _sum10 = vfmacc_vv_f16m1(_sum10, _k22, _r32, vl); - _sum11 = vfmacc_vv_f16m1(_sum11, _k20, _r31, vl); - _sum11 = vfmacc_vv_f16m1(_sum11, _k21, _r32, vl); - _sum11 = vfmacc_vv_f16m1(_sum11, _k22, _r33, vl); - - vse16_v_f16m1(outptr0, _sum00, vl); - vse16_v_f16m1(outptr0 + packn, _sum01, vl); - vse16_v_f16m1(outptr1, _sum10, vl); - vse16_v_f16m1(outptr1 + packn, _sum11, vl); + vfloat16m1_t _r00 = __riscv_vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = __riscv_vle16_v_f16m1(r0 + packn, vl); + vfloat16m1_t _r02 = __riscv_vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = __riscv_vle16_v_f16m1(r0 + packn * 3, vl); + + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k00, _r00, vl); + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k01, _r01, vl); + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k02, _r02, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k00, _r01, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k01, _r02, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k02, _r03, vl); + + vfloat16m1_t _r10 = __riscv_vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = __riscv_vle16_v_f16m1(r1 + packn, vl); + vfloat16m1_t _r12 = __riscv_vle16_v_f16m1(r1 + packn * 2, vl); + vfloat16m1_t _r13 = __riscv_vle16_v_f16m1(r1 + packn * 3, vl); + + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k10, _r10, vl); + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k11, _r11, vl); + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k12, _r12, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k10, _r11, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k11, _r12, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k12, _r13, vl); + _sum10 = __riscv_vfmacc_vv_f16m1(_sum10, _k00, _r10, vl); + _sum10 = __riscv_vfmacc_vv_f16m1(_sum10, _k01, _r11, vl); + _sum10 = __riscv_vfmacc_vv_f16m1(_sum10, _k02, _r12, vl); + _sum11 = __riscv_vfmacc_vv_f16m1(_sum11, _k00, _r11, vl); + _sum11 = __riscv_vfmacc_vv_f16m1(_sum11, _k01, _r12, vl); + _sum11 = __riscv_vfmacc_vv_f16m1(_sum11, _k02, _r13, vl); + + vfloat16m1_t _r20 = __riscv_vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = __riscv_vle16_v_f16m1(r2 + packn, vl); + vfloat16m1_t _r22 = __riscv_vle16_v_f16m1(r2 + packn * 2, vl); + vfloat16m1_t _r23 = __riscv_vle16_v_f16m1(r2 + packn * 3, vl); + + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k20, _r20, vl); + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k21, _r21, vl); + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k22, _r22, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k20, _r21, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k21, _r22, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k22, _r23, vl); + _sum10 = __riscv_vfmacc_vv_f16m1(_sum10, _k10, _r20, vl); + _sum10 = __riscv_vfmacc_vv_f16m1(_sum10, _k11, _r21, vl); + _sum10 = __riscv_vfmacc_vv_f16m1(_sum10, _k12, _r22, vl); + _sum11 = __riscv_vfmacc_vv_f16m1(_sum11, _k10, _r21, vl); + _sum11 = __riscv_vfmacc_vv_f16m1(_sum11, _k11, _r22, vl); + _sum11 = __riscv_vfmacc_vv_f16m1(_sum11, _k12, _r23, vl); + + vfloat16m1_t _r30 = __riscv_vle16_v_f16m1(r3, vl); + vfloat16m1_t _r31 = __riscv_vle16_v_f16m1(r3 + 
packn, vl); + vfloat16m1_t _r32 = __riscv_vle16_v_f16m1(r3 + packn * 2, vl); + vfloat16m1_t _r33 = __riscv_vle16_v_f16m1(r3 + packn * 3, vl); + + _sum10 = __riscv_vfmacc_vv_f16m1(_sum10, _k20, _r30, vl); + _sum10 = __riscv_vfmacc_vv_f16m1(_sum10, _k21, _r31, vl); + _sum10 = __riscv_vfmacc_vv_f16m1(_sum10, _k22, _r32, vl); + _sum11 = __riscv_vfmacc_vv_f16m1(_sum11, _k20, _r31, vl); + _sum11 = __riscv_vfmacc_vv_f16m1(_sum11, _k21, _r32, vl); + _sum11 = __riscv_vfmacc_vv_f16m1(_sum11, _k22, _r33, vl); + + __riscv_vse16_v_f16m1(outptr0, _sum00, vl); + __riscv_vse16_v_f16m1(outptr0 + packn, _sum01, vl); + __riscv_vse16_v_f16m1(outptr1, _sum10, vl); + __riscv_vse16_v_f16m1(outptr1 + packn, _sum11, vl); outptr0 += packn * 2; outptr1 += packn * 2; @@ -144,46 +144,46 @@ static void convdw3x3s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, vfloat16m1_t _sum0 = _bias0; vfloat16m1_t _sum1 = _bias0; - vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); - vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn, vl); - vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r00 = __riscv_vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = __riscv_vle16_v_f16m1(r0 + packn, vl); + vfloat16m1_t _r02 = __riscv_vle16_v_f16m1(r0 + packn * 2, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k00, _r00, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k01, _r01, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k02, _r02, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k00, _r00, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k01, _r01, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k02, _r02, vl); - vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); - vfloat16m1_t _r11 = vle16_v_f16m1(r1 + packn, vl); - vfloat16m1_t _r12 = vle16_v_f16m1(r1 + packn * 2, vl); + vfloat16m1_t _r10 = __riscv_vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = __riscv_vle16_v_f16m1(r1 + packn, vl); + vfloat16m1_t _r12 = __riscv_vle16_v_f16m1(r1 + packn * 2, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k10, _r10, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k11, _r11, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k12, _r12, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k00, _r10, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k01, _r11, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k02, _r12, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k10, _r10, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k11, _r11, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k12, _r12, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k00, _r10, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k01, _r11, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k02, _r12, vl); - vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); - vfloat16m1_t _r21 = vle16_v_f16m1(r2 + packn, vl); - vfloat16m1_t _r22 = vle16_v_f16m1(r2 + packn * 2, vl); + vfloat16m1_t _r20 = __riscv_vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = __riscv_vle16_v_f16m1(r2 + packn, vl); + vfloat16m1_t _r22 = __riscv_vle16_v_f16m1(r2 + packn * 2, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k20, _r20, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k21, _r21, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k22, _r22, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k10, _r20, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k11, _r21, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k12, _r22, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k20, _r20, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k21, _r21, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k22, _r22, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k10, _r20, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k11, _r21, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k12, _r22, vl); - vfloat16m1_t _r30 = 
vle16_v_f16m1(r3, vl); - vfloat16m1_t _r31 = vle16_v_f16m1(r3 + packn, vl); - vfloat16m1_t _r32 = vle16_v_f16m1(r3 + packn * 2, vl); + vfloat16m1_t _r30 = __riscv_vle16_v_f16m1(r3, vl); + vfloat16m1_t _r31 = __riscv_vle16_v_f16m1(r3 + packn, vl); + vfloat16m1_t _r32 = __riscv_vle16_v_f16m1(r3 + packn * 2, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k20, _r30, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k21, _r31, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k22, _r32, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k20, _r30, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k21, _r31, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k22, _r32, vl); - vse16_v_f16m1(outptr0, _sum0, vl); - vse16_v_f16m1(outptr1, _sum1, vl); + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr1, _sum1, vl); outptr0 += packn; outptr1 += packn; @@ -210,44 +210,44 @@ static void convdw3x3s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, vfloat16m1_t _sum00 = _bias0; vfloat16m1_t _sum01 = _bias0; - vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); - vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn, vl); - vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); - vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); - - _sum00 = vfmacc_vv_f16m1(_sum00, _k00, _r00, vl); - _sum00 = vfmacc_vv_f16m1(_sum00, _k01, _r01, vl); - _sum00 = vfmacc_vv_f16m1(_sum00, _k02, _r02, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k00, _r01, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k01, _r02, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k02, _r03, vl); - - vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); - vfloat16m1_t _r11 = vle16_v_f16m1(r1 + packn, vl); - vfloat16m1_t _r12 = vle16_v_f16m1(r1 + packn * 2, vl); - vfloat16m1_t _r13 = vle16_v_f16m1(r1 + packn * 3, vl); - - _sum00 = vfmacc_vv_f16m1(_sum00, _k10, _r10, vl); - _sum00 = vfmacc_vv_f16m1(_sum00, _k11, _r11, vl); - _sum00 = vfmacc_vv_f16m1(_sum00, _k12, _r12, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k10, _r11, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k11, _r12, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k12, _r13, vl); - - vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); - vfloat16m1_t _r21 = vle16_v_f16m1(r2 + packn, vl); - vfloat16m1_t _r22 = vle16_v_f16m1(r2 + packn * 2, vl); - vfloat16m1_t _r23 = vle16_v_f16m1(r2 + packn * 3, vl); - - _sum00 = vfmacc_vv_f16m1(_sum00, _k20, _r20, vl); - _sum00 = vfmacc_vv_f16m1(_sum00, _k21, _r21, vl); - _sum00 = vfmacc_vv_f16m1(_sum00, _k22, _r22, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k20, _r21, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k21, _r22, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k22, _r23, vl); - - vse16_v_f16m1(outptr0, _sum00, vl); - vse16_v_f16m1(outptr0 + packn, _sum01, vl); + vfloat16m1_t _r00 = __riscv_vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = __riscv_vle16_v_f16m1(r0 + packn, vl); + vfloat16m1_t _r02 = __riscv_vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = __riscv_vle16_v_f16m1(r0 + packn * 3, vl); + + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k00, _r00, vl); + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k01, _r01, vl); + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k02, _r02, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k00, _r01, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k01, _r02, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k02, _r03, vl); + + vfloat16m1_t _r10 = __riscv_vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = __riscv_vle16_v_f16m1(r1 + packn, vl); + vfloat16m1_t _r12 = __riscv_vle16_v_f16m1(r1 + packn * 2, vl); + vfloat16m1_t _r13 = __riscv_vle16_v_f16m1(r1 + packn * 3, vl); + + _sum00 = 
__riscv_vfmacc_vv_f16m1(_sum00, _k10, _r10, vl); + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k11, _r11, vl); + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k12, _r12, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k10, _r11, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k11, _r12, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k12, _r13, vl); + + vfloat16m1_t _r20 = __riscv_vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = __riscv_vle16_v_f16m1(r2 + packn, vl); + vfloat16m1_t _r22 = __riscv_vle16_v_f16m1(r2 + packn * 2, vl); + vfloat16m1_t _r23 = __riscv_vle16_v_f16m1(r2 + packn * 3, vl); + + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k20, _r20, vl); + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k21, _r21, vl); + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k22, _r22, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k20, _r21, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k21, _r22, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k22, _r23, vl); + + __riscv_vse16_v_f16m1(outptr0, _sum00, vl); + __riscv_vse16_v_f16m1(outptr0 + packn, _sum01, vl); outptr0 += packn * 2; @@ -259,31 +259,31 @@ static void convdw3x3s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, { vfloat16m1_t _sum0 = _bias0; - vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); - vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn, vl); - vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r00 = __riscv_vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = __riscv_vle16_v_f16m1(r0 + packn, vl); + vfloat16m1_t _r02 = __riscv_vle16_v_f16m1(r0 + packn * 2, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k00, _r00, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k01, _r01, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k02, _r02, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k00, _r00, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k01, _r01, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k02, _r02, vl); - vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); - vfloat16m1_t _r11 = vle16_v_f16m1(r1 + packn, vl); - vfloat16m1_t _r12 = vle16_v_f16m1(r1 + packn * 2, vl); + vfloat16m1_t _r10 = __riscv_vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = __riscv_vle16_v_f16m1(r1 + packn, vl); + vfloat16m1_t _r12 = __riscv_vle16_v_f16m1(r1 + packn * 2, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k10, _r10, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k11, _r11, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k12, _r12, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k10, _r10, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k11, _r11, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k12, _r12, vl); - vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); - vfloat16m1_t _r21 = vle16_v_f16m1(r2 + packn, vl); - vfloat16m1_t _r22 = vle16_v_f16m1(r2 + packn * 2, vl); + vfloat16m1_t _r20 = __riscv_vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = __riscv_vle16_v_f16m1(r2 + packn, vl); + vfloat16m1_t _r22 = __riscv_vle16_v_f16m1(r2 + packn * 2, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k20, _r20, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k21, _r21, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k22, _r22, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k20, _r20, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k21, _r21, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k22, _r22, vl); - vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); outptr0 += packn; @@ -302,7 +302,7 @@ static void convdw3x3s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, static void convdw3x3s2_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = 
csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; @@ -320,7 +320,7 @@ static void convdw3x3s2_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, { Mat out = top_blob.channel(g); - vfloat16m1_t _bias0 = bias ? vle16_v_f16m1(bias + g * packn, vl) : vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _bias0 = bias ? __riscv_vle16_v_f16m1(bias + g * packn, vl) : __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); const __fp16* k0 = kernel.row(g); @@ -332,15 +332,15 @@ static void convdw3x3s2_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const __fp16* r1 = img0.row(1); const __fp16* r2 = img0.row(2); - vfloat16m1_t _k00 = vle16_v_f16m1(k0, vl); - vfloat16m1_t _k01 = vle16_v_f16m1(k0 + packn, vl); - vfloat16m1_t _k02 = vle16_v_f16m1(k0 + packn * 2, vl); - vfloat16m1_t _k10 = vle16_v_f16m1(k0 + packn * 3, vl); - vfloat16m1_t _k11 = vle16_v_f16m1(k0 + packn * 4, vl); - vfloat16m1_t _k12 = vle16_v_f16m1(k0 + packn * 5, vl); - vfloat16m1_t _k20 = vle16_v_f16m1(k0 + packn * 6, vl); - vfloat16m1_t _k21 = vle16_v_f16m1(k0 + packn * 7, vl); - vfloat16m1_t _k22 = vle16_v_f16m1(k0 + packn * 8, vl); + vfloat16m1_t _k00 = __riscv_vle16_v_f16m1(k0, vl); + vfloat16m1_t _k01 = __riscv_vle16_v_f16m1(k0 + packn, vl); + vfloat16m1_t _k02 = __riscv_vle16_v_f16m1(k0 + packn * 2, vl); + vfloat16m1_t _k10 = __riscv_vle16_v_f16m1(k0 + packn * 3, vl); + vfloat16m1_t _k11 = __riscv_vle16_v_f16m1(k0 + packn * 4, vl); + vfloat16m1_t _k12 = __riscv_vle16_v_f16m1(k0 + packn * 5, vl); + vfloat16m1_t _k20 = __riscv_vle16_v_f16m1(k0 + packn * 6, vl); + vfloat16m1_t _k21 = __riscv_vle16_v_f16m1(k0 + packn * 7, vl); + vfloat16m1_t _k22 = __riscv_vle16_v_f16m1(k0 + packn * 8, vl); int i = 0; for (; i < outh; i++) @@ -351,47 +351,47 @@ static void convdw3x3s2_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, vfloat16m1_t _sum00 = _bias0; vfloat16m1_t _sum01 = _bias0; - vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); - vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn, vl); - vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); - vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); - vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); - - _sum00 = vfmacc_vv_f16m1(_sum00, _k00, _r00, vl); - _sum00 = vfmacc_vv_f16m1(_sum00, _k01, _r01, vl); - _sum00 = vfmacc_vv_f16m1(_sum00, _k02, _r02, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k00, _r02, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k01, _r03, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k02, _r04, vl); - - vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); - vfloat16m1_t _r11 = vle16_v_f16m1(r1 + packn, vl); - vfloat16m1_t _r12 = vle16_v_f16m1(r1 + packn * 2, vl); - vfloat16m1_t _r13 = vle16_v_f16m1(r1 + packn * 3, vl); - vfloat16m1_t _r14 = vle16_v_f16m1(r1 + packn * 4, vl); - - _sum00 = vfmacc_vv_f16m1(_sum00, _k10, _r10, vl); - _sum00 = vfmacc_vv_f16m1(_sum00, _k11, _r11, vl); - _sum00 = vfmacc_vv_f16m1(_sum00, _k12, _r12, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k10, _r12, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k11, _r13, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k12, _r14, vl); - - vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); - vfloat16m1_t _r21 = vle16_v_f16m1(r2 + packn, vl); - vfloat16m1_t _r22 = vle16_v_f16m1(r2 + packn * 2, vl); - vfloat16m1_t _r23 = vle16_v_f16m1(r2 + packn * 3, vl); - vfloat16m1_t _r24 = vle16_v_f16m1(r2 + packn * 4, vl); - - _sum00 = vfmacc_vv_f16m1(_sum00, _k20, _r20, vl); - _sum00 = vfmacc_vv_f16m1(_sum00, _k21, _r21, vl); - _sum00 = vfmacc_vv_f16m1(_sum00, _k22, _r22, vl); - _sum01 = 
vfmacc_vv_f16m1(_sum01, _k20, _r22, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k21, _r23, vl); - _sum01 = vfmacc_vv_f16m1(_sum01, _k22, _r24, vl); - - vse16_v_f16m1(outptr0, _sum00, vl); - vse16_v_f16m1(outptr0 + packn, _sum01, vl); + vfloat16m1_t _r00 = __riscv_vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = __riscv_vle16_v_f16m1(r0 + packn, vl); + vfloat16m1_t _r02 = __riscv_vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = __riscv_vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r04 = __riscv_vle16_v_f16m1(r0 + packn * 4, vl); + + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k00, _r00, vl); + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k01, _r01, vl); + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k02, _r02, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k00, _r02, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k01, _r03, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k02, _r04, vl); + + vfloat16m1_t _r10 = __riscv_vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = __riscv_vle16_v_f16m1(r1 + packn, vl); + vfloat16m1_t _r12 = __riscv_vle16_v_f16m1(r1 + packn * 2, vl); + vfloat16m1_t _r13 = __riscv_vle16_v_f16m1(r1 + packn * 3, vl); + vfloat16m1_t _r14 = __riscv_vle16_v_f16m1(r1 + packn * 4, vl); + + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k10, _r10, vl); + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k11, _r11, vl); + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k12, _r12, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k10, _r12, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k11, _r13, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k12, _r14, vl); + + vfloat16m1_t _r20 = __riscv_vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = __riscv_vle16_v_f16m1(r2 + packn, vl); + vfloat16m1_t _r22 = __riscv_vle16_v_f16m1(r2 + packn * 2, vl); + vfloat16m1_t _r23 = __riscv_vle16_v_f16m1(r2 + packn * 3, vl); + vfloat16m1_t _r24 = __riscv_vle16_v_f16m1(r2 + packn * 4, vl); + + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k20, _r20, vl); + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k21, _r21, vl); + _sum00 = __riscv_vfmacc_vv_f16m1(_sum00, _k22, _r22, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k20, _r22, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k21, _r23, vl); + _sum01 = __riscv_vfmacc_vv_f16m1(_sum01, _k22, _r24, vl); + + __riscv_vse16_v_f16m1(outptr0, _sum00, vl); + __riscv_vse16_v_f16m1(outptr0 + packn, _sum01, vl); outptr0 += packn * 2; @@ -403,31 +403,31 @@ static void convdw3x3s2_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, { vfloat16m1_t _sum0 = _bias0; - vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); - vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn, vl); - vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r00 = __riscv_vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = __riscv_vle16_v_f16m1(r0 + packn, vl); + vfloat16m1_t _r02 = __riscv_vle16_v_f16m1(r0 + packn * 2, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k00, _r00, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k01, _r01, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k02, _r02, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k00, _r00, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k01, _r01, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k02, _r02, vl); - vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); - vfloat16m1_t _r11 = vle16_v_f16m1(r1 + packn, vl); - vfloat16m1_t _r12 = vle16_v_f16m1(r1 + packn * 2, vl); + vfloat16m1_t _r10 = __riscv_vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = __riscv_vle16_v_f16m1(r1 + packn, vl); + vfloat16m1_t _r12 = __riscv_vle16_v_f16m1(r1 + packn * 2, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k10, _r10, vl); - _sum0 
= vfmacc_vv_f16m1(_sum0, _k11, _r11, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k12, _r12, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k10, _r10, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k11, _r11, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k12, _r12, vl); - vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); - vfloat16m1_t _r21 = vle16_v_f16m1(r2 + packn, vl); - vfloat16m1_t _r22 = vle16_v_f16m1(r2 + packn * 2, vl); + vfloat16m1_t _r20 = __riscv_vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = __riscv_vle16_v_f16m1(r2 + packn, vl); + vfloat16m1_t _r22 = __riscv_vle16_v_f16m1(r2 + packn * 2, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k20, _r20, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k21, _r21, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k22, _r22, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k20, _r20, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k21, _r21, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k22, _r22, vl); - vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); outptr0 += packn; diff --git a/src/layer/riscv/convolutiondepthwise_5x5_packn.h b/src/layer/riscv/convolutiondepthwise_5x5_packn.h index 2ef2fea7455..371d077f76e 100644 --- a/src/layer/riscv/convolutiondepthwise_5x5_packn.h +++ b/src/layer/riscv/convolutiondepthwise_5x5_packn.h @@ -15,7 +15,7 @@ static void convdw5x5s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int w = bottom_blob.w; @@ -31,7 +31,7 @@ static void convdw5x5s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M { Mat out = top_blob.channel(g); - vfloat32m1_t _bias0 = bias ? vle32_v_f32m1(bias + g * packn, vl) : vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _bias0 = bias ? 
__riscv_vle32_v_f32m1(bias + g * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); const float* k0 = kernel.row(g); @@ -56,139 +56,139 @@ static void convdw5x5s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M vfloat32m1_t _sum0 = _bias0; vfloat32m1_t _sum1 = _bias0; - vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); - vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn, vl); - vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); - vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); - vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); - - vfloat32m1_t _k00 = vle32_v_f32m1(k0, vl); - vfloat32m1_t _k01 = vle32_v_f32m1(k0 + packn, vl); - vfloat32m1_t _k02 = vle32_v_f32m1(k0 + packn * 2, vl); - vfloat32m1_t _k03 = vle32_v_f32m1(k0 + packn * 3, vl); - vfloat32m1_t _k04 = vle32_v_f32m1(k0 + packn * 4, vl); + vfloat32m1_t _r00 = __riscv_vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = __riscv_vle32_v_f32m1(r0 + packn, vl); + vfloat32m1_t _r02 = __riscv_vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = __riscv_vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = __riscv_vle32_v_f32m1(r0 + packn * 4, vl); + + vfloat32m1_t _k00 = __riscv_vle32_v_f32m1(k0, vl); + vfloat32m1_t _k01 = __riscv_vle32_v_f32m1(k0 + packn, vl); + vfloat32m1_t _k02 = __riscv_vle32_v_f32m1(k0 + packn * 2, vl); + vfloat32m1_t _k03 = __riscv_vle32_v_f32m1(k0 + packn * 3, vl); + vfloat32m1_t _k04 = __riscv_vle32_v_f32m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f32m1(_sum0, _k00, _r00, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k01, _r01, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k02, _r02, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k03, _r03, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k04, _r04, vl); - - vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); - vfloat32m1_t _r11 = vle32_v_f32m1(r1 + packn, vl); - vfloat32m1_t _r12 = vle32_v_f32m1(r1 + packn * 2, vl); - vfloat32m1_t _r13 = vle32_v_f32m1(r1 + packn * 3, vl); - vfloat32m1_t _r14 = vle32_v_f32m1(r1 + packn * 4, vl); - - _sum1 = vfmacc_vv_f32m1(_sum1, _k00, _r10, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k01, _r11, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k02, _r12, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k03, _r13, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k04, _r14, vl); - - vfloat32m1_t _k10 = vle32_v_f32m1(k0, vl); - vfloat32m1_t _k11 = vle32_v_f32m1(k0 + packn, vl); - vfloat32m1_t _k12 = vle32_v_f32m1(k0 + packn * 2, vl); - vfloat32m1_t _k13 = vle32_v_f32m1(k0 + packn * 3, vl); - vfloat32m1_t _k14 = vle32_v_f32m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k00, _r00, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k01, _r01, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k02, _r02, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k03, _r03, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k04, _r04, vl); + + vfloat32m1_t _r10 = __riscv_vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = __riscv_vle32_v_f32m1(r1 + packn, vl); + vfloat32m1_t _r12 = __riscv_vle32_v_f32m1(r1 + packn * 2, vl); + vfloat32m1_t _r13 = __riscv_vle32_v_f32m1(r1 + packn * 3, vl); + vfloat32m1_t _r14 = __riscv_vle32_v_f32m1(r1 + packn * 4, vl); + + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k00, _r10, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k01, _r11, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k02, _r12, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k03, _r13, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k04, _r14, vl); + + vfloat32m1_t _k10 = __riscv_vle32_v_f32m1(k0, vl); + vfloat32m1_t _k11 = __riscv_vle32_v_f32m1(k0 + packn, vl); + vfloat32m1_t _k12 = __riscv_vle32_v_f32m1(k0 + 
packn * 2, vl); + vfloat32m1_t _k13 = __riscv_vle32_v_f32m1(k0 + packn * 3, vl); + vfloat32m1_t _k14 = __riscv_vle32_v_f32m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f32m1(_sum0, _k10, _r10, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k11, _r11, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k12, _r12, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k13, _r13, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k14, _r14, vl); - - vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); - vfloat32m1_t _r21 = vle32_v_f32m1(r2 + packn, vl); - vfloat32m1_t _r22 = vle32_v_f32m1(r2 + packn * 2, vl); - vfloat32m1_t _r23 = vle32_v_f32m1(r2 + packn * 3, vl); - vfloat32m1_t _r24 = vle32_v_f32m1(r2 + packn * 4, vl); - - _sum1 = vfmacc_vv_f32m1(_sum1, _k10, _r20, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k11, _r21, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k12, _r22, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k13, _r23, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k14, _r24, vl); - - vfloat32m1_t _k20 = vle32_v_f32m1(k0, vl); - vfloat32m1_t _k21 = vle32_v_f32m1(k0 + packn, vl); - vfloat32m1_t _k22 = vle32_v_f32m1(k0 + packn * 2, vl); - vfloat32m1_t _k23 = vle32_v_f32m1(k0 + packn * 3, vl); - vfloat32m1_t _k24 = vle32_v_f32m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k10, _r10, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k11, _r11, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k12, _r12, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k13, _r13, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k14, _r14, vl); + + vfloat32m1_t _r20 = __riscv_vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = __riscv_vle32_v_f32m1(r2 + packn, vl); + vfloat32m1_t _r22 = __riscv_vle32_v_f32m1(r2 + packn * 2, vl); + vfloat32m1_t _r23 = __riscv_vle32_v_f32m1(r2 + packn * 3, vl); + vfloat32m1_t _r24 = __riscv_vle32_v_f32m1(r2 + packn * 4, vl); + + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k10, _r20, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k11, _r21, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k12, _r22, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k13, _r23, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k14, _r24, vl); + + vfloat32m1_t _k20 = __riscv_vle32_v_f32m1(k0, vl); + vfloat32m1_t _k21 = __riscv_vle32_v_f32m1(k0 + packn, vl); + vfloat32m1_t _k22 = __riscv_vle32_v_f32m1(k0 + packn * 2, vl); + vfloat32m1_t _k23 = __riscv_vle32_v_f32m1(k0 + packn * 3, vl); + vfloat32m1_t _k24 = __riscv_vle32_v_f32m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f32m1(_sum0, _k20, _r20, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k21, _r21, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k22, _r22, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k23, _r23, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k24, _r24, vl); - - vfloat32m1_t _r30 = vle32_v_f32m1(r3, vl); - vfloat32m1_t _r31 = vle32_v_f32m1(r3 + packn, vl); - vfloat32m1_t _r32 = vle32_v_f32m1(r3 + packn * 2, vl); - vfloat32m1_t _r33 = vle32_v_f32m1(r3 + packn * 3, vl); - vfloat32m1_t _r34 = vle32_v_f32m1(r3 + packn * 4, vl); - - _sum1 = vfmacc_vv_f32m1(_sum1, _k20, _r30, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k21, _r31, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k22, _r32, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k23, _r33, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k24, _r34, vl); - - vfloat32m1_t _k30 = vle32_v_f32m1(k0, vl); - vfloat32m1_t _k31 = vle32_v_f32m1(k0 + packn, vl); - vfloat32m1_t _k32 = vle32_v_f32m1(k0 + packn * 2, vl); - vfloat32m1_t _k33 = vle32_v_f32m1(k0 + packn * 3, vl); - vfloat32m1_t _k34 = vle32_v_f32m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k20, _r20, vl); + _sum0 = 
__riscv_vfmacc_vv_f32m1(_sum0, _k21, _r21, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k22, _r22, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k23, _r23, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k24, _r24, vl); + + vfloat32m1_t _r30 = __riscv_vle32_v_f32m1(r3, vl); + vfloat32m1_t _r31 = __riscv_vle32_v_f32m1(r3 + packn, vl); + vfloat32m1_t _r32 = __riscv_vle32_v_f32m1(r3 + packn * 2, vl); + vfloat32m1_t _r33 = __riscv_vle32_v_f32m1(r3 + packn * 3, vl); + vfloat32m1_t _r34 = __riscv_vle32_v_f32m1(r3 + packn * 4, vl); + + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k20, _r30, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k21, _r31, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k22, _r32, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k23, _r33, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k24, _r34, vl); + + vfloat32m1_t _k30 = __riscv_vle32_v_f32m1(k0, vl); + vfloat32m1_t _k31 = __riscv_vle32_v_f32m1(k0 + packn, vl); + vfloat32m1_t _k32 = __riscv_vle32_v_f32m1(k0 + packn * 2, vl); + vfloat32m1_t _k33 = __riscv_vle32_v_f32m1(k0 + packn * 3, vl); + vfloat32m1_t _k34 = __riscv_vle32_v_f32m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f32m1(_sum0, _k30, _r30, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k31, _r31, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k32, _r32, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k33, _r33, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k34, _r34, vl); - - vfloat32m1_t _r40 = vle32_v_f32m1(r4, vl); - vfloat32m1_t _r41 = vle32_v_f32m1(r4 + packn, vl); - vfloat32m1_t _r42 = vle32_v_f32m1(r4 + packn * 2, vl); - vfloat32m1_t _r43 = vle32_v_f32m1(r4 + packn * 3, vl); - vfloat32m1_t _r44 = vle32_v_f32m1(r4 + packn * 4, vl); - - _sum1 = vfmacc_vv_f32m1(_sum1, _k30, _r40, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k31, _r41, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k32, _r42, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k33, _r43, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k34, _r44, vl); - - vfloat32m1_t _k40 = vle32_v_f32m1(k0, vl); - vfloat32m1_t _k41 = vle32_v_f32m1(k0 + packn, vl); - vfloat32m1_t _k42 = vle32_v_f32m1(k0 + packn * 2, vl); - vfloat32m1_t _k43 = vle32_v_f32m1(k0 + packn * 3, vl); - vfloat32m1_t _k44 = vle32_v_f32m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k30, _r30, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k31, _r31, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k32, _r32, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k33, _r33, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k34, _r34, vl); + + vfloat32m1_t _r40 = __riscv_vle32_v_f32m1(r4, vl); + vfloat32m1_t _r41 = __riscv_vle32_v_f32m1(r4 + packn, vl); + vfloat32m1_t _r42 = __riscv_vle32_v_f32m1(r4 + packn * 2, vl); + vfloat32m1_t _r43 = __riscv_vle32_v_f32m1(r4 + packn * 3, vl); + vfloat32m1_t _r44 = __riscv_vle32_v_f32m1(r4 + packn * 4, vl); + + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k30, _r40, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k31, _r41, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k32, _r42, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k33, _r43, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k34, _r44, vl); + + vfloat32m1_t _k40 = __riscv_vle32_v_f32m1(k0, vl); + vfloat32m1_t _k41 = __riscv_vle32_v_f32m1(k0 + packn, vl); + vfloat32m1_t _k42 = __riscv_vle32_v_f32m1(k0 + packn * 2, vl); + vfloat32m1_t _k43 = __riscv_vle32_v_f32m1(k0 + packn * 3, vl); + vfloat32m1_t _k44 = __riscv_vle32_v_f32m1(k0 + packn * 4, vl); k0 -= packn * 20; - _sum0 = vfmacc_vv_f32m1(_sum0, _k40, _r40, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k41, _r41, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, 
_k42, _r42, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k43, _r43, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k44, _r44, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k40, _r40, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k41, _r41, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k42, _r42, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k43, _r43, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k44, _r44, vl); - vfloat32m1_t _r50 = vle32_v_f32m1(r5, vl); - vfloat32m1_t _r51 = vle32_v_f32m1(r5 + packn, vl); - vfloat32m1_t _r52 = vle32_v_f32m1(r5 + packn * 2, vl); - vfloat32m1_t _r53 = vle32_v_f32m1(r5 + packn * 3, vl); - vfloat32m1_t _r54 = vle32_v_f32m1(r5 + packn * 4, vl); + vfloat32m1_t _r50 = __riscv_vle32_v_f32m1(r5, vl); + vfloat32m1_t _r51 = __riscv_vle32_v_f32m1(r5 + packn, vl); + vfloat32m1_t _r52 = __riscv_vle32_v_f32m1(r5 + packn * 2, vl); + vfloat32m1_t _r53 = __riscv_vle32_v_f32m1(r5 + packn * 3, vl); + vfloat32m1_t _r54 = __riscv_vle32_v_f32m1(r5 + packn * 4, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k40, _r50, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k41, _r51, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k42, _r52, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k43, _r53, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, _k44, _r54, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k40, _r50, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k41, _r51, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k42, _r52, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k43, _r53, vl); + _sum1 = __riscv_vfmacc_vv_f32m1(_sum1, _k44, _r54, vl); - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr1, _sum1, vl); + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr1, _sum1, vl); outptr0 += packn; outptr1 += packn; @@ -218,102 +218,102 @@ static void convdw5x5s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M { vfloat32m1_t _sum0 = _bias0; - vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); - vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn, vl); - vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); - vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); - vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); - - vfloat32m1_t _k00 = vle32_v_f32m1(k0, vl); - vfloat32m1_t _k01 = vle32_v_f32m1(k0 + packn, vl); - vfloat32m1_t _k02 = vle32_v_f32m1(k0 + packn * 2, vl); - vfloat32m1_t _k03 = vle32_v_f32m1(k0 + packn * 3, vl); - vfloat32m1_t _k04 = vle32_v_f32m1(k0 + packn * 4, vl); + vfloat32m1_t _r00 = __riscv_vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = __riscv_vle32_v_f32m1(r0 + packn, vl); + vfloat32m1_t _r02 = __riscv_vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = __riscv_vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = __riscv_vle32_v_f32m1(r0 + packn * 4, vl); + + vfloat32m1_t _k00 = __riscv_vle32_v_f32m1(k0, vl); + vfloat32m1_t _k01 = __riscv_vle32_v_f32m1(k0 + packn, vl); + vfloat32m1_t _k02 = __riscv_vle32_v_f32m1(k0 + packn * 2, vl); + vfloat32m1_t _k03 = __riscv_vle32_v_f32m1(k0 + packn * 3, vl); + vfloat32m1_t _k04 = __riscv_vle32_v_f32m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f32m1(_sum0, _k00, _r00, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k01, _r01, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k02, _r02, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k03, _r03, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k04, _r04, vl); - - vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); - vfloat32m1_t _r11 = vle32_v_f32m1(r1 + packn, vl); - vfloat32m1_t _r12 = vle32_v_f32m1(r1 + packn * 2, vl); - vfloat32m1_t _r13 = vle32_v_f32m1(r1 + packn * 3, vl); - vfloat32m1_t _r14 = vle32_v_f32m1(r1 + packn * 4, 
vl); - - vfloat32m1_t _k10 = vle32_v_f32m1(k0, vl); - vfloat32m1_t _k11 = vle32_v_f32m1(k0 + packn, vl); - vfloat32m1_t _k12 = vle32_v_f32m1(k0 + packn * 2, vl); - vfloat32m1_t _k13 = vle32_v_f32m1(k0 + packn * 3, vl); - vfloat32m1_t _k14 = vle32_v_f32m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k00, _r00, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k01, _r01, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k02, _r02, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k03, _r03, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k04, _r04, vl); + + vfloat32m1_t _r10 = __riscv_vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = __riscv_vle32_v_f32m1(r1 + packn, vl); + vfloat32m1_t _r12 = __riscv_vle32_v_f32m1(r1 + packn * 2, vl); + vfloat32m1_t _r13 = __riscv_vle32_v_f32m1(r1 + packn * 3, vl); + vfloat32m1_t _r14 = __riscv_vle32_v_f32m1(r1 + packn * 4, vl); + + vfloat32m1_t _k10 = __riscv_vle32_v_f32m1(k0, vl); + vfloat32m1_t _k11 = __riscv_vle32_v_f32m1(k0 + packn, vl); + vfloat32m1_t _k12 = __riscv_vle32_v_f32m1(k0 + packn * 2, vl); + vfloat32m1_t _k13 = __riscv_vle32_v_f32m1(k0 + packn * 3, vl); + vfloat32m1_t _k14 = __riscv_vle32_v_f32m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f32m1(_sum0, _k10, _r10, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k11, _r11, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k12, _r12, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k13, _r13, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k14, _r14, vl); - - vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); - vfloat32m1_t _r21 = vle32_v_f32m1(r2 + packn, vl); - vfloat32m1_t _r22 = vle32_v_f32m1(r2 + packn * 2, vl); - vfloat32m1_t _r23 = vle32_v_f32m1(r2 + packn * 3, vl); - vfloat32m1_t _r24 = vle32_v_f32m1(r2 + packn * 4, vl); - - vfloat32m1_t _k20 = vle32_v_f32m1(k0, vl); - vfloat32m1_t _k21 = vle32_v_f32m1(k0 + packn, vl); - vfloat32m1_t _k22 = vle32_v_f32m1(k0 + packn * 2, vl); - vfloat32m1_t _k23 = vle32_v_f32m1(k0 + packn * 3, vl); - vfloat32m1_t _k24 = vle32_v_f32m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k10, _r10, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k11, _r11, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k12, _r12, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k13, _r13, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k14, _r14, vl); + + vfloat32m1_t _r20 = __riscv_vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = __riscv_vle32_v_f32m1(r2 + packn, vl); + vfloat32m1_t _r22 = __riscv_vle32_v_f32m1(r2 + packn * 2, vl); + vfloat32m1_t _r23 = __riscv_vle32_v_f32m1(r2 + packn * 3, vl); + vfloat32m1_t _r24 = __riscv_vle32_v_f32m1(r2 + packn * 4, vl); + + vfloat32m1_t _k20 = __riscv_vle32_v_f32m1(k0, vl); + vfloat32m1_t _k21 = __riscv_vle32_v_f32m1(k0 + packn, vl); + vfloat32m1_t _k22 = __riscv_vle32_v_f32m1(k0 + packn * 2, vl); + vfloat32m1_t _k23 = __riscv_vle32_v_f32m1(k0 + packn * 3, vl); + vfloat32m1_t _k24 = __riscv_vle32_v_f32m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f32m1(_sum0, _k20, _r20, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k21, _r21, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k22, _r22, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k23, _r23, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k24, _r24, vl); - - vfloat32m1_t _r30 = vle32_v_f32m1(r3, vl); - vfloat32m1_t _r31 = vle32_v_f32m1(r3 + packn, vl); - vfloat32m1_t _r32 = vle32_v_f32m1(r3 + packn * 2, vl); - vfloat32m1_t _r33 = vle32_v_f32m1(r3 + packn * 3, vl); - vfloat32m1_t _r34 = vle32_v_f32m1(r3 + packn * 4, vl); - - vfloat32m1_t _k30 = vle32_v_f32m1(k0, vl); - vfloat32m1_t _k31 = vle32_v_f32m1(k0 + packn, vl); - 
vfloat32m1_t _k32 = vle32_v_f32m1(k0 + packn * 2, vl); - vfloat32m1_t _k33 = vle32_v_f32m1(k0 + packn * 3, vl); - vfloat32m1_t _k34 = vle32_v_f32m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k20, _r20, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k21, _r21, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k22, _r22, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k23, _r23, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k24, _r24, vl); + + vfloat32m1_t _r30 = __riscv_vle32_v_f32m1(r3, vl); + vfloat32m1_t _r31 = __riscv_vle32_v_f32m1(r3 + packn, vl); + vfloat32m1_t _r32 = __riscv_vle32_v_f32m1(r3 + packn * 2, vl); + vfloat32m1_t _r33 = __riscv_vle32_v_f32m1(r3 + packn * 3, vl); + vfloat32m1_t _r34 = __riscv_vle32_v_f32m1(r3 + packn * 4, vl); + + vfloat32m1_t _k30 = __riscv_vle32_v_f32m1(k0, vl); + vfloat32m1_t _k31 = __riscv_vle32_v_f32m1(k0 + packn, vl); + vfloat32m1_t _k32 = __riscv_vle32_v_f32m1(k0 + packn * 2, vl); + vfloat32m1_t _k33 = __riscv_vle32_v_f32m1(k0 + packn * 3, vl); + vfloat32m1_t _k34 = __riscv_vle32_v_f32m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f32m1(_sum0, _k30, _r30, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k31, _r31, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k32, _r32, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k33, _r33, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k34, _r34, vl); - - vfloat32m1_t _r40 = vle32_v_f32m1(r4, vl); - vfloat32m1_t _r41 = vle32_v_f32m1(r4 + packn, vl); - vfloat32m1_t _r42 = vle32_v_f32m1(r4 + packn * 2, vl); - vfloat32m1_t _r43 = vle32_v_f32m1(r4 + packn * 3, vl); - vfloat32m1_t _r44 = vle32_v_f32m1(r4 + packn * 4, vl); - - vfloat32m1_t _k40 = vle32_v_f32m1(k0, vl); - vfloat32m1_t _k41 = vle32_v_f32m1(k0 + packn, vl); - vfloat32m1_t _k42 = vle32_v_f32m1(k0 + packn * 2, vl); - vfloat32m1_t _k43 = vle32_v_f32m1(k0 + packn * 3, vl); - vfloat32m1_t _k44 = vle32_v_f32m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k30, _r30, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k31, _r31, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k32, _r32, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k33, _r33, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k34, _r34, vl); + + vfloat32m1_t _r40 = __riscv_vle32_v_f32m1(r4, vl); + vfloat32m1_t _r41 = __riscv_vle32_v_f32m1(r4 + packn, vl); + vfloat32m1_t _r42 = __riscv_vle32_v_f32m1(r4 + packn * 2, vl); + vfloat32m1_t _r43 = __riscv_vle32_v_f32m1(r4 + packn * 3, vl); + vfloat32m1_t _r44 = __riscv_vle32_v_f32m1(r4 + packn * 4, vl); + + vfloat32m1_t _k40 = __riscv_vle32_v_f32m1(k0, vl); + vfloat32m1_t _k41 = __riscv_vle32_v_f32m1(k0 + packn, vl); + vfloat32m1_t _k42 = __riscv_vle32_v_f32m1(k0 + packn * 2, vl); + vfloat32m1_t _k43 = __riscv_vle32_v_f32m1(k0 + packn * 3, vl); + vfloat32m1_t _k44 = __riscv_vle32_v_f32m1(k0 + packn * 4, vl); k0 -= packn * 20; - _sum0 = vfmacc_vv_f32m1(_sum0, _k40, _r40, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k41, _r41, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k42, _r42, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k43, _r43, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k44, _r44, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k40, _r40, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k41, _r41, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k42, _r42, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k43, _r43, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k44, _r44, vl); - vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); outptr0 += packn; @@ -336,7 +336,7 @@ static void convdw5x5s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, 
const M static void convdw5x5s2_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int w = bottom_blob.w; @@ -354,7 +354,7 @@ static void convdw5x5s2_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M { Mat out = top_blob.channel(g); - vfloat32m1_t _bias0 = bias ? vle32_v_f32m1(bias + g * packn, vl) : vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _bias0 = bias ? __riscv_vle32_v_f32m1(bias + g * packn, vl) : __riscv_vfmv_v_f_f32m1(0.f, vl); const float* k0 = kernel.row(g); @@ -376,102 +376,102 @@ static void convdw5x5s2_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M { vfloat32m1_t _sum0 = _bias0; - vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl); - vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn, vl); - vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl); - vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl); - vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl); - - vfloat32m1_t _k00 = vle32_v_f32m1(k0, vl); - vfloat32m1_t _k01 = vle32_v_f32m1(k0 + packn, vl); - vfloat32m1_t _k02 = vle32_v_f32m1(k0 + packn * 2, vl); - vfloat32m1_t _k03 = vle32_v_f32m1(k0 + packn * 3, vl); - vfloat32m1_t _k04 = vle32_v_f32m1(k0 + packn * 4, vl); + vfloat32m1_t _r00 = __riscv_vle32_v_f32m1(r0, vl); + vfloat32m1_t _r01 = __riscv_vle32_v_f32m1(r0 + packn, vl); + vfloat32m1_t _r02 = __riscv_vle32_v_f32m1(r0 + packn * 2, vl); + vfloat32m1_t _r03 = __riscv_vle32_v_f32m1(r0 + packn * 3, vl); + vfloat32m1_t _r04 = __riscv_vle32_v_f32m1(r0 + packn * 4, vl); + + vfloat32m1_t _k00 = __riscv_vle32_v_f32m1(k0, vl); + vfloat32m1_t _k01 = __riscv_vle32_v_f32m1(k0 + packn, vl); + vfloat32m1_t _k02 = __riscv_vle32_v_f32m1(k0 + packn * 2, vl); + vfloat32m1_t _k03 = __riscv_vle32_v_f32m1(k0 + packn * 3, vl); + vfloat32m1_t _k04 = __riscv_vle32_v_f32m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f32m1(_sum0, _k00, _r00, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k01, _r01, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k02, _r02, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k03, _r03, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k04, _r04, vl); - - vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl); - vfloat32m1_t _r11 = vle32_v_f32m1(r1 + packn, vl); - vfloat32m1_t _r12 = vle32_v_f32m1(r1 + packn * 2, vl); - vfloat32m1_t _r13 = vle32_v_f32m1(r1 + packn * 3, vl); - vfloat32m1_t _r14 = vle32_v_f32m1(r1 + packn * 4, vl); - - vfloat32m1_t _k10 = vle32_v_f32m1(k0, vl); - vfloat32m1_t _k11 = vle32_v_f32m1(k0 + packn, vl); - vfloat32m1_t _k12 = vle32_v_f32m1(k0 + packn * 2, vl); - vfloat32m1_t _k13 = vle32_v_f32m1(k0 + packn * 3, vl); - vfloat32m1_t _k14 = vle32_v_f32m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k00, _r00, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k01, _r01, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k02, _r02, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k03, _r03, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k04, _r04, vl); + + vfloat32m1_t _r10 = __riscv_vle32_v_f32m1(r1, vl); + vfloat32m1_t _r11 = __riscv_vle32_v_f32m1(r1 + packn, vl); + vfloat32m1_t _r12 = __riscv_vle32_v_f32m1(r1 + packn * 2, vl); + vfloat32m1_t _r13 = __riscv_vle32_v_f32m1(r1 + packn * 3, vl); + vfloat32m1_t _r14 = __riscv_vle32_v_f32m1(r1 + packn * 4, vl); + + vfloat32m1_t _k10 = __riscv_vle32_v_f32m1(k0, vl); + vfloat32m1_t _k11 = __riscv_vle32_v_f32m1(k0 + packn, vl); + vfloat32m1_t _k12 = __riscv_vle32_v_f32m1(k0 + packn * 2, vl); + 
vfloat32m1_t _k13 = __riscv_vle32_v_f32m1(k0 + packn * 3, vl); + vfloat32m1_t _k14 = __riscv_vle32_v_f32m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f32m1(_sum0, _k10, _r10, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k11, _r11, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k12, _r12, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k13, _r13, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k14, _r14, vl); - - vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl); - vfloat32m1_t _r21 = vle32_v_f32m1(r2 + packn, vl); - vfloat32m1_t _r22 = vle32_v_f32m1(r2 + packn * 2, vl); - vfloat32m1_t _r23 = vle32_v_f32m1(r2 + packn * 3, vl); - vfloat32m1_t _r24 = vle32_v_f32m1(r2 + packn * 4, vl); - - vfloat32m1_t _k20 = vle32_v_f32m1(k0, vl); - vfloat32m1_t _k21 = vle32_v_f32m1(k0 + packn, vl); - vfloat32m1_t _k22 = vle32_v_f32m1(k0 + packn * 2, vl); - vfloat32m1_t _k23 = vle32_v_f32m1(k0 + packn * 3, vl); - vfloat32m1_t _k24 = vle32_v_f32m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k10, _r10, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k11, _r11, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k12, _r12, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k13, _r13, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k14, _r14, vl); + + vfloat32m1_t _r20 = __riscv_vle32_v_f32m1(r2, vl); + vfloat32m1_t _r21 = __riscv_vle32_v_f32m1(r2 + packn, vl); + vfloat32m1_t _r22 = __riscv_vle32_v_f32m1(r2 + packn * 2, vl); + vfloat32m1_t _r23 = __riscv_vle32_v_f32m1(r2 + packn * 3, vl); + vfloat32m1_t _r24 = __riscv_vle32_v_f32m1(r2 + packn * 4, vl); + + vfloat32m1_t _k20 = __riscv_vle32_v_f32m1(k0, vl); + vfloat32m1_t _k21 = __riscv_vle32_v_f32m1(k0 + packn, vl); + vfloat32m1_t _k22 = __riscv_vle32_v_f32m1(k0 + packn * 2, vl); + vfloat32m1_t _k23 = __riscv_vle32_v_f32m1(k0 + packn * 3, vl); + vfloat32m1_t _k24 = __riscv_vle32_v_f32m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f32m1(_sum0, _k20, _r20, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k21, _r21, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k22, _r22, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k23, _r23, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k24, _r24, vl); - - vfloat32m1_t _r30 = vle32_v_f32m1(r3, vl); - vfloat32m1_t _r31 = vle32_v_f32m1(r3 + packn, vl); - vfloat32m1_t _r32 = vle32_v_f32m1(r3 + packn * 2, vl); - vfloat32m1_t _r33 = vle32_v_f32m1(r3 + packn * 3, vl); - vfloat32m1_t _r34 = vle32_v_f32m1(r3 + packn * 4, vl); - - vfloat32m1_t _k30 = vle32_v_f32m1(k0, vl); - vfloat32m1_t _k31 = vle32_v_f32m1(k0 + packn, vl); - vfloat32m1_t _k32 = vle32_v_f32m1(k0 + packn * 2, vl); - vfloat32m1_t _k33 = vle32_v_f32m1(k0 + packn * 3, vl); - vfloat32m1_t _k34 = vle32_v_f32m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k20, _r20, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k21, _r21, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k22, _r22, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k23, _r23, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k24, _r24, vl); + + vfloat32m1_t _r30 = __riscv_vle32_v_f32m1(r3, vl); + vfloat32m1_t _r31 = __riscv_vle32_v_f32m1(r3 + packn, vl); + vfloat32m1_t _r32 = __riscv_vle32_v_f32m1(r3 + packn * 2, vl); + vfloat32m1_t _r33 = __riscv_vle32_v_f32m1(r3 + packn * 3, vl); + vfloat32m1_t _r34 = __riscv_vle32_v_f32m1(r3 + packn * 4, vl); + + vfloat32m1_t _k30 = __riscv_vle32_v_f32m1(k0, vl); + vfloat32m1_t _k31 = __riscv_vle32_v_f32m1(k0 + packn, vl); + vfloat32m1_t _k32 = __riscv_vle32_v_f32m1(k0 + packn * 2, vl); + vfloat32m1_t _k33 = __riscv_vle32_v_f32m1(k0 + packn * 3, vl); + vfloat32m1_t _k34 = __riscv_vle32_v_f32m1(k0 + 
packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f32m1(_sum0, _k30, _r30, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k31, _r31, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k32, _r32, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k33, _r33, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k34, _r34, vl); - - vfloat32m1_t _r40 = vle32_v_f32m1(r4, vl); - vfloat32m1_t _r41 = vle32_v_f32m1(r4 + packn, vl); - vfloat32m1_t _r42 = vle32_v_f32m1(r4 + packn * 2, vl); - vfloat32m1_t _r43 = vle32_v_f32m1(r4 + packn * 3, vl); - vfloat32m1_t _r44 = vle32_v_f32m1(r4 + packn * 4, vl); - - vfloat32m1_t _k40 = vle32_v_f32m1(k0, vl); - vfloat32m1_t _k41 = vle32_v_f32m1(k0 + packn, vl); - vfloat32m1_t _k42 = vle32_v_f32m1(k0 + packn * 2, vl); - vfloat32m1_t _k43 = vle32_v_f32m1(k0 + packn * 3, vl); - vfloat32m1_t _k44 = vle32_v_f32m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k30, _r30, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k31, _r31, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k32, _r32, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k33, _r33, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k34, _r34, vl); + + vfloat32m1_t _r40 = __riscv_vle32_v_f32m1(r4, vl); + vfloat32m1_t _r41 = __riscv_vle32_v_f32m1(r4 + packn, vl); + vfloat32m1_t _r42 = __riscv_vle32_v_f32m1(r4 + packn * 2, vl); + vfloat32m1_t _r43 = __riscv_vle32_v_f32m1(r4 + packn * 3, vl); + vfloat32m1_t _r44 = __riscv_vle32_v_f32m1(r4 + packn * 4, vl); + + vfloat32m1_t _k40 = __riscv_vle32_v_f32m1(k0, vl); + vfloat32m1_t _k41 = __riscv_vle32_v_f32m1(k0 + packn, vl); + vfloat32m1_t _k42 = __riscv_vle32_v_f32m1(k0 + packn * 2, vl); + vfloat32m1_t _k43 = __riscv_vle32_v_f32m1(k0 + packn * 3, vl); + vfloat32m1_t _k44 = __riscv_vle32_v_f32m1(k0 + packn * 4, vl); k0 -= packn * 20; - _sum0 = vfmacc_vv_f32m1(_sum0, _k40, _r40, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k41, _r41, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k42, _r42, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k43, _r43, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, _k44, _r44, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k40, _r40, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k41, _r41, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k42, _r42, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k43, _r43, vl); + _sum0 = __riscv_vfmacc_vv_f32m1(_sum0, _k44, _r44, vl); - vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); outptr0 += packn; diff --git a/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h b/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h index 08270e307c9..8169d842983 100644 --- a/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h +++ b/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h @@ -15,7 +15,7 @@ static void convdw5x5s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; @@ -31,7 +31,7 @@ static void convdw5x5s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, { Mat out = top_blob.channel(g); - vfloat16m1_t _bias0 = bias ? vle16_v_f16m1(bias + g * packn, vl) : vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _bias0 = bias ? 
__riscv_vle16_v_f16m1(bias + g * packn, vl) : __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); const __fp16* k0 = kernel.row(g); @@ -56,139 +56,139 @@ static void convdw5x5s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, vfloat16m1_t _sum0 = _bias0; vfloat16m1_t _sum1 = _bias0; - vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); - vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn, vl); - vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); - vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); - vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); - - vfloat16m1_t _k00 = vle16_v_f16m1(k0, vl); - vfloat16m1_t _k01 = vle16_v_f16m1(k0 + packn, vl); - vfloat16m1_t _k02 = vle16_v_f16m1(k0 + packn * 2, vl); - vfloat16m1_t _k03 = vle16_v_f16m1(k0 + packn * 3, vl); - vfloat16m1_t _k04 = vle16_v_f16m1(k0 + packn * 4, vl); + vfloat16m1_t _r00 = __riscv_vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = __riscv_vle16_v_f16m1(r0 + packn, vl); + vfloat16m1_t _r02 = __riscv_vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = __riscv_vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r04 = __riscv_vle16_v_f16m1(r0 + packn * 4, vl); + + vfloat16m1_t _k00 = __riscv_vle16_v_f16m1(k0, vl); + vfloat16m1_t _k01 = __riscv_vle16_v_f16m1(k0 + packn, vl); + vfloat16m1_t _k02 = __riscv_vle16_v_f16m1(k0 + packn * 2, vl); + vfloat16m1_t _k03 = __riscv_vle16_v_f16m1(k0 + packn * 3, vl); + vfloat16m1_t _k04 = __riscv_vle16_v_f16m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f16m1(_sum0, _k00, _r00, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k01, _r01, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k02, _r02, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k03, _r03, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k04, _r04, vl); - - vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); - vfloat16m1_t _r11 = vle16_v_f16m1(r1 + packn, vl); - vfloat16m1_t _r12 = vle16_v_f16m1(r1 + packn * 2, vl); - vfloat16m1_t _r13 = vle16_v_f16m1(r1 + packn * 3, vl); - vfloat16m1_t _r14 = vle16_v_f16m1(r1 + packn * 4, vl); - - _sum1 = vfmacc_vv_f16m1(_sum1, _k00, _r10, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k01, _r11, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k02, _r12, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k03, _r13, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k04, _r14, vl); - - vfloat16m1_t _k10 = vle16_v_f16m1(k0, vl); - vfloat16m1_t _k11 = vle16_v_f16m1(k0 + packn, vl); - vfloat16m1_t _k12 = vle16_v_f16m1(k0 + packn * 2, vl); - vfloat16m1_t _k13 = vle16_v_f16m1(k0 + packn * 3, vl); - vfloat16m1_t _k14 = vle16_v_f16m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k00, _r00, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k01, _r01, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k02, _r02, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k03, _r03, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k04, _r04, vl); + + vfloat16m1_t _r10 = __riscv_vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = __riscv_vle16_v_f16m1(r1 + packn, vl); + vfloat16m1_t _r12 = __riscv_vle16_v_f16m1(r1 + packn * 2, vl); + vfloat16m1_t _r13 = __riscv_vle16_v_f16m1(r1 + packn * 3, vl); + vfloat16m1_t _r14 = __riscv_vle16_v_f16m1(r1 + packn * 4, vl); + + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k00, _r10, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k01, _r11, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k02, _r12, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k03, _r13, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k04, _r14, vl); + + vfloat16m1_t _k10 = __riscv_vle16_v_f16m1(k0, vl); + vfloat16m1_t _k11 = __riscv_vle16_v_f16m1(k0 + packn, vl); + vfloat16m1_t _k12 = 
__riscv_vle16_v_f16m1(k0 + packn * 2, vl); + vfloat16m1_t _k13 = __riscv_vle16_v_f16m1(k0 + packn * 3, vl); + vfloat16m1_t _k14 = __riscv_vle16_v_f16m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f16m1(_sum0, _k10, _r10, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k11, _r11, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k12, _r12, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k13, _r13, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k14, _r14, vl); - - vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); - vfloat16m1_t _r21 = vle16_v_f16m1(r2 + packn, vl); - vfloat16m1_t _r22 = vle16_v_f16m1(r2 + packn * 2, vl); - vfloat16m1_t _r23 = vle16_v_f16m1(r2 + packn * 3, vl); - vfloat16m1_t _r24 = vle16_v_f16m1(r2 + packn * 4, vl); - - _sum1 = vfmacc_vv_f16m1(_sum1, _k10, _r20, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k11, _r21, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k12, _r22, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k13, _r23, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k14, _r24, vl); - - vfloat16m1_t _k20 = vle16_v_f16m1(k0, vl); - vfloat16m1_t _k21 = vle16_v_f16m1(k0 + packn, vl); - vfloat16m1_t _k22 = vle16_v_f16m1(k0 + packn * 2, vl); - vfloat16m1_t _k23 = vle16_v_f16m1(k0 + packn * 3, vl); - vfloat16m1_t _k24 = vle16_v_f16m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k10, _r10, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k11, _r11, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k12, _r12, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k13, _r13, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k14, _r14, vl); + + vfloat16m1_t _r20 = __riscv_vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = __riscv_vle16_v_f16m1(r2 + packn, vl); + vfloat16m1_t _r22 = __riscv_vle16_v_f16m1(r2 + packn * 2, vl); + vfloat16m1_t _r23 = __riscv_vle16_v_f16m1(r2 + packn * 3, vl); + vfloat16m1_t _r24 = __riscv_vle16_v_f16m1(r2 + packn * 4, vl); + + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k10, _r20, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k11, _r21, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k12, _r22, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k13, _r23, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k14, _r24, vl); + + vfloat16m1_t _k20 = __riscv_vle16_v_f16m1(k0, vl); + vfloat16m1_t _k21 = __riscv_vle16_v_f16m1(k0 + packn, vl); + vfloat16m1_t _k22 = __riscv_vle16_v_f16m1(k0 + packn * 2, vl); + vfloat16m1_t _k23 = __riscv_vle16_v_f16m1(k0 + packn * 3, vl); + vfloat16m1_t _k24 = __riscv_vle16_v_f16m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f16m1(_sum0, _k20, _r20, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k21, _r21, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k22, _r22, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k23, _r23, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k24, _r24, vl); - - vfloat16m1_t _r30 = vle16_v_f16m1(r3, vl); - vfloat16m1_t _r31 = vle16_v_f16m1(r3 + packn, vl); - vfloat16m1_t _r32 = vle16_v_f16m1(r3 + packn * 2, vl); - vfloat16m1_t _r33 = vle16_v_f16m1(r3 + packn * 3, vl); - vfloat16m1_t _r34 = vle16_v_f16m1(r3 + packn * 4, vl); - - _sum1 = vfmacc_vv_f16m1(_sum1, _k20, _r30, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k21, _r31, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k22, _r32, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k23, _r33, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k24, _r34, vl); - - vfloat16m1_t _k30 = vle16_v_f16m1(k0, vl); - vfloat16m1_t _k31 = vle16_v_f16m1(k0 + packn, vl); - vfloat16m1_t _k32 = vle16_v_f16m1(k0 + packn * 2, vl); - vfloat16m1_t _k33 = vle16_v_f16m1(k0 + packn * 3, vl); - vfloat16m1_t _k34 = vle16_v_f16m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k20, _r20, 
vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k21, _r21, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k22, _r22, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k23, _r23, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k24, _r24, vl); + + vfloat16m1_t _r30 = __riscv_vle16_v_f16m1(r3, vl); + vfloat16m1_t _r31 = __riscv_vle16_v_f16m1(r3 + packn, vl); + vfloat16m1_t _r32 = __riscv_vle16_v_f16m1(r3 + packn * 2, vl); + vfloat16m1_t _r33 = __riscv_vle16_v_f16m1(r3 + packn * 3, vl); + vfloat16m1_t _r34 = __riscv_vle16_v_f16m1(r3 + packn * 4, vl); + + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k20, _r30, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k21, _r31, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k22, _r32, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k23, _r33, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k24, _r34, vl); + + vfloat16m1_t _k30 = __riscv_vle16_v_f16m1(k0, vl); + vfloat16m1_t _k31 = __riscv_vle16_v_f16m1(k0 + packn, vl); + vfloat16m1_t _k32 = __riscv_vle16_v_f16m1(k0 + packn * 2, vl); + vfloat16m1_t _k33 = __riscv_vle16_v_f16m1(k0 + packn * 3, vl); + vfloat16m1_t _k34 = __riscv_vle16_v_f16m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f16m1(_sum0, _k30, _r30, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k31, _r31, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k32, _r32, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k33, _r33, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k34, _r34, vl); - - vfloat16m1_t _r40 = vle16_v_f16m1(r4, vl); - vfloat16m1_t _r41 = vle16_v_f16m1(r4 + packn, vl); - vfloat16m1_t _r42 = vle16_v_f16m1(r4 + packn * 2, vl); - vfloat16m1_t _r43 = vle16_v_f16m1(r4 + packn * 3, vl); - vfloat16m1_t _r44 = vle16_v_f16m1(r4 + packn * 4, vl); - - _sum1 = vfmacc_vv_f16m1(_sum1, _k30, _r40, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k31, _r41, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k32, _r42, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k33, _r43, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k34, _r44, vl); - - vfloat16m1_t _k40 = vle16_v_f16m1(k0, vl); - vfloat16m1_t _k41 = vle16_v_f16m1(k0 + packn, vl); - vfloat16m1_t _k42 = vle16_v_f16m1(k0 + packn * 2, vl); - vfloat16m1_t _k43 = vle16_v_f16m1(k0 + packn * 3, vl); - vfloat16m1_t _k44 = vle16_v_f16m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k30, _r30, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k31, _r31, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k32, _r32, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k33, _r33, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k34, _r34, vl); + + vfloat16m1_t _r40 = __riscv_vle16_v_f16m1(r4, vl); + vfloat16m1_t _r41 = __riscv_vle16_v_f16m1(r4 + packn, vl); + vfloat16m1_t _r42 = __riscv_vle16_v_f16m1(r4 + packn * 2, vl); + vfloat16m1_t _r43 = __riscv_vle16_v_f16m1(r4 + packn * 3, vl); + vfloat16m1_t _r44 = __riscv_vle16_v_f16m1(r4 + packn * 4, vl); + + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k30, _r40, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k31, _r41, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k32, _r42, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k33, _r43, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k34, _r44, vl); + + vfloat16m1_t _k40 = __riscv_vle16_v_f16m1(k0, vl); + vfloat16m1_t _k41 = __riscv_vle16_v_f16m1(k0 + packn, vl); + vfloat16m1_t _k42 = __riscv_vle16_v_f16m1(k0 + packn * 2, vl); + vfloat16m1_t _k43 = __riscv_vle16_v_f16m1(k0 + packn * 3, vl); + vfloat16m1_t _k44 = __riscv_vle16_v_f16m1(k0 + packn * 4, vl); k0 -= packn * 20; - _sum0 = vfmacc_vv_f16m1(_sum0, _k40, _r40, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k41, _r41, vl); - _sum0 = 
vfmacc_vv_f16m1(_sum0, _k42, _r42, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k43, _r43, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k44, _r44, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k40, _r40, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k41, _r41, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k42, _r42, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k43, _r43, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k44, _r44, vl); - vfloat16m1_t _r50 = vle16_v_f16m1(r5, vl); - vfloat16m1_t _r51 = vle16_v_f16m1(r5 + packn, vl); - vfloat16m1_t _r52 = vle16_v_f16m1(r5 + packn * 2, vl); - vfloat16m1_t _r53 = vle16_v_f16m1(r5 + packn * 3, vl); - vfloat16m1_t _r54 = vle16_v_f16m1(r5 + packn * 4, vl); + vfloat16m1_t _r50 = __riscv_vle16_v_f16m1(r5, vl); + vfloat16m1_t _r51 = __riscv_vle16_v_f16m1(r5 + packn, vl); + vfloat16m1_t _r52 = __riscv_vle16_v_f16m1(r5 + packn * 2, vl); + vfloat16m1_t _r53 = __riscv_vle16_v_f16m1(r5 + packn * 3, vl); + vfloat16m1_t _r54 = __riscv_vle16_v_f16m1(r5 + packn * 4, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k40, _r50, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k41, _r51, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k42, _r52, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k43, _r53, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, _k44, _r54, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k40, _r50, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k41, _r51, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k42, _r52, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k43, _r53, vl); + _sum1 = __riscv_vfmacc_vv_f16m1(_sum1, _k44, _r54, vl); - vse16_v_f16m1(outptr0, _sum0, vl); - vse16_v_f16m1(outptr1, _sum1, vl); + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr1, _sum1, vl); outptr0 += packn; outptr1 += packn; @@ -218,102 +218,102 @@ static void convdw5x5s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, { vfloat16m1_t _sum0 = _bias0; - vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); - vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn, vl); - vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); - vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); - vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); - - vfloat16m1_t _k00 = vle16_v_f16m1(k0, vl); - vfloat16m1_t _k01 = vle16_v_f16m1(k0 + packn, vl); - vfloat16m1_t _k02 = vle16_v_f16m1(k0 + packn * 2, vl); - vfloat16m1_t _k03 = vle16_v_f16m1(k0 + packn * 3, vl); - vfloat16m1_t _k04 = vle16_v_f16m1(k0 + packn * 4, vl); + vfloat16m1_t _r00 = __riscv_vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = __riscv_vle16_v_f16m1(r0 + packn, vl); + vfloat16m1_t _r02 = __riscv_vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = __riscv_vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r04 = __riscv_vle16_v_f16m1(r0 + packn * 4, vl); + + vfloat16m1_t _k00 = __riscv_vle16_v_f16m1(k0, vl); + vfloat16m1_t _k01 = __riscv_vle16_v_f16m1(k0 + packn, vl); + vfloat16m1_t _k02 = __riscv_vle16_v_f16m1(k0 + packn * 2, vl); + vfloat16m1_t _k03 = __riscv_vle16_v_f16m1(k0 + packn * 3, vl); + vfloat16m1_t _k04 = __riscv_vle16_v_f16m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f16m1(_sum0, _k00, _r00, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k01, _r01, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k02, _r02, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k03, _r03, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k04, _r04, vl); - - vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); - vfloat16m1_t _r11 = vle16_v_f16m1(r1 + packn, vl); - vfloat16m1_t _r12 = vle16_v_f16m1(r1 + packn * 2, vl); - vfloat16m1_t _r13 = vle16_v_f16m1(r1 + packn * 3, vl); - vfloat16m1_t _r14 = 
vle16_v_f16m1(r1 + packn * 4, vl); - - vfloat16m1_t _k10 = vle16_v_f16m1(k0, vl); - vfloat16m1_t _k11 = vle16_v_f16m1(k0 + packn, vl); - vfloat16m1_t _k12 = vle16_v_f16m1(k0 + packn * 2, vl); - vfloat16m1_t _k13 = vle16_v_f16m1(k0 + packn * 3, vl); - vfloat16m1_t _k14 = vle16_v_f16m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k00, _r00, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k01, _r01, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k02, _r02, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k03, _r03, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k04, _r04, vl); + + vfloat16m1_t _r10 = __riscv_vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = __riscv_vle16_v_f16m1(r1 + packn, vl); + vfloat16m1_t _r12 = __riscv_vle16_v_f16m1(r1 + packn * 2, vl); + vfloat16m1_t _r13 = __riscv_vle16_v_f16m1(r1 + packn * 3, vl); + vfloat16m1_t _r14 = __riscv_vle16_v_f16m1(r1 + packn * 4, vl); + + vfloat16m1_t _k10 = __riscv_vle16_v_f16m1(k0, vl); + vfloat16m1_t _k11 = __riscv_vle16_v_f16m1(k0 + packn, vl); + vfloat16m1_t _k12 = __riscv_vle16_v_f16m1(k0 + packn * 2, vl); + vfloat16m1_t _k13 = __riscv_vle16_v_f16m1(k0 + packn * 3, vl); + vfloat16m1_t _k14 = __riscv_vle16_v_f16m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f16m1(_sum0, _k10, _r10, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k11, _r11, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k12, _r12, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k13, _r13, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k14, _r14, vl); - - vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); - vfloat16m1_t _r21 = vle16_v_f16m1(r2 + packn, vl); - vfloat16m1_t _r22 = vle16_v_f16m1(r2 + packn * 2, vl); - vfloat16m1_t _r23 = vle16_v_f16m1(r2 + packn * 3, vl); - vfloat16m1_t _r24 = vle16_v_f16m1(r2 + packn * 4, vl); - - vfloat16m1_t _k20 = vle16_v_f16m1(k0, vl); - vfloat16m1_t _k21 = vle16_v_f16m1(k0 + packn, vl); - vfloat16m1_t _k22 = vle16_v_f16m1(k0 + packn * 2, vl); - vfloat16m1_t _k23 = vle16_v_f16m1(k0 + packn * 3, vl); - vfloat16m1_t _k24 = vle16_v_f16m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k10, _r10, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k11, _r11, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k12, _r12, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k13, _r13, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k14, _r14, vl); + + vfloat16m1_t _r20 = __riscv_vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = __riscv_vle16_v_f16m1(r2 + packn, vl); + vfloat16m1_t _r22 = __riscv_vle16_v_f16m1(r2 + packn * 2, vl); + vfloat16m1_t _r23 = __riscv_vle16_v_f16m1(r2 + packn * 3, vl); + vfloat16m1_t _r24 = __riscv_vle16_v_f16m1(r2 + packn * 4, vl); + + vfloat16m1_t _k20 = __riscv_vle16_v_f16m1(k0, vl); + vfloat16m1_t _k21 = __riscv_vle16_v_f16m1(k0 + packn, vl); + vfloat16m1_t _k22 = __riscv_vle16_v_f16m1(k0 + packn * 2, vl); + vfloat16m1_t _k23 = __riscv_vle16_v_f16m1(k0 + packn * 3, vl); + vfloat16m1_t _k24 = __riscv_vle16_v_f16m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f16m1(_sum0, _k20, _r20, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k21, _r21, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k22, _r22, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k23, _r23, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k24, _r24, vl); - - vfloat16m1_t _r30 = vle16_v_f16m1(r3, vl); - vfloat16m1_t _r31 = vle16_v_f16m1(r3 + packn, vl); - vfloat16m1_t _r32 = vle16_v_f16m1(r3 + packn * 2, vl); - vfloat16m1_t _r33 = vle16_v_f16m1(r3 + packn * 3, vl); - vfloat16m1_t _r34 = vle16_v_f16m1(r3 + packn * 4, vl); - - vfloat16m1_t _k30 = vle16_v_f16m1(k0, vl); - vfloat16m1_t _k31 = 
vle16_v_f16m1(k0 + packn, vl); - vfloat16m1_t _k32 = vle16_v_f16m1(k0 + packn * 2, vl); - vfloat16m1_t _k33 = vle16_v_f16m1(k0 + packn * 3, vl); - vfloat16m1_t _k34 = vle16_v_f16m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k20, _r20, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k21, _r21, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k22, _r22, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k23, _r23, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k24, _r24, vl); + + vfloat16m1_t _r30 = __riscv_vle16_v_f16m1(r3, vl); + vfloat16m1_t _r31 = __riscv_vle16_v_f16m1(r3 + packn, vl); + vfloat16m1_t _r32 = __riscv_vle16_v_f16m1(r3 + packn * 2, vl); + vfloat16m1_t _r33 = __riscv_vle16_v_f16m1(r3 + packn * 3, vl); + vfloat16m1_t _r34 = __riscv_vle16_v_f16m1(r3 + packn * 4, vl); + + vfloat16m1_t _k30 = __riscv_vle16_v_f16m1(k0, vl); + vfloat16m1_t _k31 = __riscv_vle16_v_f16m1(k0 + packn, vl); + vfloat16m1_t _k32 = __riscv_vle16_v_f16m1(k0 + packn * 2, vl); + vfloat16m1_t _k33 = __riscv_vle16_v_f16m1(k0 + packn * 3, vl); + vfloat16m1_t _k34 = __riscv_vle16_v_f16m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f16m1(_sum0, _k30, _r30, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k31, _r31, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k32, _r32, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k33, _r33, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k34, _r34, vl); - - vfloat16m1_t _r40 = vle16_v_f16m1(r4, vl); - vfloat16m1_t _r41 = vle16_v_f16m1(r4 + packn, vl); - vfloat16m1_t _r42 = vle16_v_f16m1(r4 + packn * 2, vl); - vfloat16m1_t _r43 = vle16_v_f16m1(r4 + packn * 3, vl); - vfloat16m1_t _r44 = vle16_v_f16m1(r4 + packn * 4, vl); - - vfloat16m1_t _k40 = vle16_v_f16m1(k0, vl); - vfloat16m1_t _k41 = vle16_v_f16m1(k0 + packn, vl); - vfloat16m1_t _k42 = vle16_v_f16m1(k0 + packn * 2, vl); - vfloat16m1_t _k43 = vle16_v_f16m1(k0 + packn * 3, vl); - vfloat16m1_t _k44 = vle16_v_f16m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k30, _r30, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k31, _r31, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k32, _r32, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k33, _r33, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k34, _r34, vl); + + vfloat16m1_t _r40 = __riscv_vle16_v_f16m1(r4, vl); + vfloat16m1_t _r41 = __riscv_vle16_v_f16m1(r4 + packn, vl); + vfloat16m1_t _r42 = __riscv_vle16_v_f16m1(r4 + packn * 2, vl); + vfloat16m1_t _r43 = __riscv_vle16_v_f16m1(r4 + packn * 3, vl); + vfloat16m1_t _r44 = __riscv_vle16_v_f16m1(r4 + packn * 4, vl); + + vfloat16m1_t _k40 = __riscv_vle16_v_f16m1(k0, vl); + vfloat16m1_t _k41 = __riscv_vle16_v_f16m1(k0 + packn, vl); + vfloat16m1_t _k42 = __riscv_vle16_v_f16m1(k0 + packn * 2, vl); + vfloat16m1_t _k43 = __riscv_vle16_v_f16m1(k0 + packn * 3, vl); + vfloat16m1_t _k44 = __riscv_vle16_v_f16m1(k0 + packn * 4, vl); k0 -= packn * 20; - _sum0 = vfmacc_vv_f16m1(_sum0, _k40, _r40, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k41, _r41, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k42, _r42, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k43, _r43, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k44, _r44, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k40, _r40, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k41, _r41, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k42, _r42, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k43, _r43, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k44, _r44, vl); - vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); outptr0 += packn; @@ -336,7 +336,7 @@ static void 
convdw5x5s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, static void convdw5x5s2_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; @@ -354,7 +354,7 @@ static void convdw5x5s2_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, { Mat out = top_blob.channel(g); - vfloat16m1_t _bias0 = bias ? vle16_v_f16m1(bias + g * packn, vl) : vfmv_v_f_f16m1((__fp16)0.f, vl); + vfloat16m1_t _bias0 = bias ? __riscv_vle16_v_f16m1(bias + g * packn, vl) : __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); const __fp16* k0 = kernel.row(g); @@ -376,102 +376,102 @@ static void convdw5x5s2_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, { vfloat16m1_t _sum0 = _bias0; - vfloat16m1_t _r00 = vle16_v_f16m1(r0, vl); - vfloat16m1_t _r01 = vle16_v_f16m1(r0 + packn, vl); - vfloat16m1_t _r02 = vle16_v_f16m1(r0 + packn * 2, vl); - vfloat16m1_t _r03 = vle16_v_f16m1(r0 + packn * 3, vl); - vfloat16m1_t _r04 = vle16_v_f16m1(r0 + packn * 4, vl); - - vfloat16m1_t _k00 = vle16_v_f16m1(k0, vl); - vfloat16m1_t _k01 = vle16_v_f16m1(k0 + packn, vl); - vfloat16m1_t _k02 = vle16_v_f16m1(k0 + packn * 2, vl); - vfloat16m1_t _k03 = vle16_v_f16m1(k0 + packn * 3, vl); - vfloat16m1_t _k04 = vle16_v_f16m1(k0 + packn * 4, vl); + vfloat16m1_t _r00 = __riscv_vle16_v_f16m1(r0, vl); + vfloat16m1_t _r01 = __riscv_vle16_v_f16m1(r0 + packn, vl); + vfloat16m1_t _r02 = __riscv_vle16_v_f16m1(r0 + packn * 2, vl); + vfloat16m1_t _r03 = __riscv_vle16_v_f16m1(r0 + packn * 3, vl); + vfloat16m1_t _r04 = __riscv_vle16_v_f16m1(r0 + packn * 4, vl); + + vfloat16m1_t _k00 = __riscv_vle16_v_f16m1(k0, vl); + vfloat16m1_t _k01 = __riscv_vle16_v_f16m1(k0 + packn, vl); + vfloat16m1_t _k02 = __riscv_vle16_v_f16m1(k0 + packn * 2, vl); + vfloat16m1_t _k03 = __riscv_vle16_v_f16m1(k0 + packn * 3, vl); + vfloat16m1_t _k04 = __riscv_vle16_v_f16m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f16m1(_sum0, _k00, _r00, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k01, _r01, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k02, _r02, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k03, _r03, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k04, _r04, vl); - - vfloat16m1_t _r10 = vle16_v_f16m1(r1, vl); - vfloat16m1_t _r11 = vle16_v_f16m1(r1 + packn, vl); - vfloat16m1_t _r12 = vle16_v_f16m1(r1 + packn * 2, vl); - vfloat16m1_t _r13 = vle16_v_f16m1(r1 + packn * 3, vl); - vfloat16m1_t _r14 = vle16_v_f16m1(r1 + packn * 4, vl); - - vfloat16m1_t _k10 = vle16_v_f16m1(k0, vl); - vfloat16m1_t _k11 = vle16_v_f16m1(k0 + packn, vl); - vfloat16m1_t _k12 = vle16_v_f16m1(k0 + packn * 2, vl); - vfloat16m1_t _k13 = vle16_v_f16m1(k0 + packn * 3, vl); - vfloat16m1_t _k14 = vle16_v_f16m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k00, _r00, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k01, _r01, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k02, _r02, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k03, _r03, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k04, _r04, vl); + + vfloat16m1_t _r10 = __riscv_vle16_v_f16m1(r1, vl); + vfloat16m1_t _r11 = __riscv_vle16_v_f16m1(r1 + packn, vl); + vfloat16m1_t _r12 = __riscv_vle16_v_f16m1(r1 + packn * 2, vl); + vfloat16m1_t _r13 = __riscv_vle16_v_f16m1(r1 + packn * 3, vl); + vfloat16m1_t _r14 = __riscv_vle16_v_f16m1(r1 + packn * 4, vl); + + vfloat16m1_t _k10 = __riscv_vle16_v_f16m1(k0, vl); + vfloat16m1_t _k11 = __riscv_vle16_v_f16m1(k0 + 
packn, vl); + vfloat16m1_t _k12 = __riscv_vle16_v_f16m1(k0 + packn * 2, vl); + vfloat16m1_t _k13 = __riscv_vle16_v_f16m1(k0 + packn * 3, vl); + vfloat16m1_t _k14 = __riscv_vle16_v_f16m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f16m1(_sum0, _k10, _r10, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k11, _r11, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k12, _r12, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k13, _r13, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k14, _r14, vl); - - vfloat16m1_t _r20 = vle16_v_f16m1(r2, vl); - vfloat16m1_t _r21 = vle16_v_f16m1(r2 + packn, vl); - vfloat16m1_t _r22 = vle16_v_f16m1(r2 + packn * 2, vl); - vfloat16m1_t _r23 = vle16_v_f16m1(r2 + packn * 3, vl); - vfloat16m1_t _r24 = vle16_v_f16m1(r2 + packn * 4, vl); - - vfloat16m1_t _k20 = vle16_v_f16m1(k0, vl); - vfloat16m1_t _k21 = vle16_v_f16m1(k0 + packn, vl); - vfloat16m1_t _k22 = vle16_v_f16m1(k0 + packn * 2, vl); - vfloat16m1_t _k23 = vle16_v_f16m1(k0 + packn * 3, vl); - vfloat16m1_t _k24 = vle16_v_f16m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k10, _r10, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k11, _r11, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k12, _r12, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k13, _r13, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k14, _r14, vl); + + vfloat16m1_t _r20 = __riscv_vle16_v_f16m1(r2, vl); + vfloat16m1_t _r21 = __riscv_vle16_v_f16m1(r2 + packn, vl); + vfloat16m1_t _r22 = __riscv_vle16_v_f16m1(r2 + packn * 2, vl); + vfloat16m1_t _r23 = __riscv_vle16_v_f16m1(r2 + packn * 3, vl); + vfloat16m1_t _r24 = __riscv_vle16_v_f16m1(r2 + packn * 4, vl); + + vfloat16m1_t _k20 = __riscv_vle16_v_f16m1(k0, vl); + vfloat16m1_t _k21 = __riscv_vle16_v_f16m1(k0 + packn, vl); + vfloat16m1_t _k22 = __riscv_vle16_v_f16m1(k0 + packn * 2, vl); + vfloat16m1_t _k23 = __riscv_vle16_v_f16m1(k0 + packn * 3, vl); + vfloat16m1_t _k24 = __riscv_vle16_v_f16m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f16m1(_sum0, _k20, _r20, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k21, _r21, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k22, _r22, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k23, _r23, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k24, _r24, vl); - - vfloat16m1_t _r30 = vle16_v_f16m1(r3, vl); - vfloat16m1_t _r31 = vle16_v_f16m1(r3 + packn, vl); - vfloat16m1_t _r32 = vle16_v_f16m1(r3 + packn * 2, vl); - vfloat16m1_t _r33 = vle16_v_f16m1(r3 + packn * 3, vl); - vfloat16m1_t _r34 = vle16_v_f16m1(r3 + packn * 4, vl); - - vfloat16m1_t _k30 = vle16_v_f16m1(k0, vl); - vfloat16m1_t _k31 = vle16_v_f16m1(k0 + packn, vl); - vfloat16m1_t _k32 = vle16_v_f16m1(k0 + packn * 2, vl); - vfloat16m1_t _k33 = vle16_v_f16m1(k0 + packn * 3, vl); - vfloat16m1_t _k34 = vle16_v_f16m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k20, _r20, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k21, _r21, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k22, _r22, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k23, _r23, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k24, _r24, vl); + + vfloat16m1_t _r30 = __riscv_vle16_v_f16m1(r3, vl); + vfloat16m1_t _r31 = __riscv_vle16_v_f16m1(r3 + packn, vl); + vfloat16m1_t _r32 = __riscv_vle16_v_f16m1(r3 + packn * 2, vl); + vfloat16m1_t _r33 = __riscv_vle16_v_f16m1(r3 + packn * 3, vl); + vfloat16m1_t _r34 = __riscv_vle16_v_f16m1(r3 + packn * 4, vl); + + vfloat16m1_t _k30 = __riscv_vle16_v_f16m1(k0, vl); + vfloat16m1_t _k31 = __riscv_vle16_v_f16m1(k0 + packn, vl); + vfloat16m1_t _k32 = __riscv_vle16_v_f16m1(k0 + packn * 2, vl); + vfloat16m1_t _k33 = 
__riscv_vle16_v_f16m1(k0 + packn * 3, vl); + vfloat16m1_t _k34 = __riscv_vle16_v_f16m1(k0 + packn * 4, vl); k0 += packn * 5; - _sum0 = vfmacc_vv_f16m1(_sum0, _k30, _r30, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k31, _r31, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k32, _r32, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k33, _r33, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k34, _r34, vl); - - vfloat16m1_t _r40 = vle16_v_f16m1(r4, vl); - vfloat16m1_t _r41 = vle16_v_f16m1(r4 + packn, vl); - vfloat16m1_t _r42 = vle16_v_f16m1(r4 + packn * 2, vl); - vfloat16m1_t _r43 = vle16_v_f16m1(r4 + packn * 3, vl); - vfloat16m1_t _r44 = vle16_v_f16m1(r4 + packn * 4, vl); - - vfloat16m1_t _k40 = vle16_v_f16m1(k0, vl); - vfloat16m1_t _k41 = vle16_v_f16m1(k0 + packn, vl); - vfloat16m1_t _k42 = vle16_v_f16m1(k0 + packn * 2, vl); - vfloat16m1_t _k43 = vle16_v_f16m1(k0 + packn * 3, vl); - vfloat16m1_t _k44 = vle16_v_f16m1(k0 + packn * 4, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k30, _r30, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k31, _r31, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k32, _r32, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k33, _r33, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k34, _r34, vl); + + vfloat16m1_t _r40 = __riscv_vle16_v_f16m1(r4, vl); + vfloat16m1_t _r41 = __riscv_vle16_v_f16m1(r4 + packn, vl); + vfloat16m1_t _r42 = __riscv_vle16_v_f16m1(r4 + packn * 2, vl); + vfloat16m1_t _r43 = __riscv_vle16_v_f16m1(r4 + packn * 3, vl); + vfloat16m1_t _r44 = __riscv_vle16_v_f16m1(r4 + packn * 4, vl); + + vfloat16m1_t _k40 = __riscv_vle16_v_f16m1(k0, vl); + vfloat16m1_t _k41 = __riscv_vle16_v_f16m1(k0 + packn, vl); + vfloat16m1_t _k42 = __riscv_vle16_v_f16m1(k0 + packn * 2, vl); + vfloat16m1_t _k43 = __riscv_vle16_v_f16m1(k0 + packn * 3, vl); + vfloat16m1_t _k44 = __riscv_vle16_v_f16m1(k0 + packn * 4, vl); k0 -= packn * 20; - _sum0 = vfmacc_vv_f16m1(_sum0, _k40, _r40, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k41, _r41, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k42, _r42, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k43, _r43, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, _k44, _r44, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k40, _r40, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k41, _r41, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k42, _r42, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k43, _r43, vl); + _sum0 = __riscv_vfmacc_vv_f16m1(_sum0, _k44, _r44, vl); - vse16_v_f16m1(outptr0, _sum0, vl); + __riscv_vse16_v_f16m1(outptr0, _sum0, vl); outptr0 += packn; diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index 6a0eb04cda5..6014b647193 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -14,9 +14,6 @@ #include "convolutiondepthwise_riscv.h" -#include "cpu.h" -#include "layer_type.h" - #if __riscv_vector #include #endif // __riscv_vector @@ -24,6 +21,9 @@ #include "riscv_activation.h" #include "riscv_usability.h" +#include "cpu.h" +#include "layer_type.h" + namespace ncnn { #include "convolutiondepthwise_3x3.h" @@ -31,21 +31,20 @@ namespace ncnn { #if __riscv_vector #include "convolutiondepthwise_3x3_packn.h" #include "convolutiondepthwise_5x5_packn.h" - -#if __riscv_zfh -#include "convolutiondepthwise_3x3_packn_fp16s.h" -#include "convolutiondepthwise_5x5_packn_fp16s.h" -#endif #endif // __riscv_vector ConvolutionDepthWise_riscv::ConvolutionDepthWise_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector 
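The hunks in these files apply one mechanical change: every RVV intrinsic is renamed to its __riscv_-prefixed form from the current riscv_vector.h intrinsic API (vsetvl_e32m1 -> __riscv_vsetvl_e32m1, vle32_v_f32m1 -> __riscv_vle32_v_f32m1, vfmacc_vv_f32m1 -> __riscv_vfmacc_vv_f32m1, and likewise for the e16/f16m1 variants), with arguments and semantics unchanged. Below is a minimal sketch of the load / fused-multiply-accumulate / store pattern these kernels repeat, written with the new names; it is an illustration only, not part of the patch, and it assumes a toolchain whose riscv_vector.h already exposes the prefixed intrinsics.

#include <riscv_vector.h>

/* Illustrative helper (not in the patch): one packed group of `packn`
 * fp32 lanes, computing out[i] += k[i] * r[i] the same way the convdw5x5
 * kernels update _sum0/_sum1. */
static inline void fmacc_packn(float* out, const float* k, const float* r, int packn)
{
    size_t vl = __riscv_vsetvl_e32m1(packn);            /* was vsetvl_e32m1(packn) */

    vfloat32m1_t _sum = __riscv_vle32_v_f32m1(out, vl); /* was vle32_v_f32m1(out, vl) */
    vfloat32m1_t _k = __riscv_vle32_v_f32m1(k, vl);
    vfloat32m1_t _r = __riscv_vle32_v_f32m1(r, vl);

    _sum = __riscv_vfmacc_vv_f32m1(_sum, _k, _r, vl);   /* was vfmacc_vv_f32m1(...) */

    __riscv_vse32_v_f32m1(out, _sum, vl);               /* was vse32_v_f32m1(out, _sum, vl) */
}

The vl obtained once from __riscv_vsetvl_e32m1(packn) (with packn = csrr_vlenb() / 4 for fp32 and / 2 for fp16) is the value every load, vfmacc and store in these kernels passes as its trailing argument. The constructor hunk that follows additionally replaces the compile-time __riscv_zfh guard with the NCNN_ZFH macro plus a runtime cpu_support_riscv_zvfh() / cpu_support_riscv_zfh() query, so fp16 storage is only enabled when the executing CPU actually supports it.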
+#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif activation = 0; } @@ -65,8 +64,8 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt) } #endif -#if __riscv_vector && __riscv_zfh - if (opt.use_fp16_storage) +#if NCNN_ZFH + if (support_fp16_storage && opt.use_fp16_storage) { return create_pipeline_fp16s(opt); } @@ -264,9 +263,9 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c } #endif +#if NCNN_ZFH int elembits = bottom_blob.elembits(); -#if __riscv_vector && __riscv_zfh if (opt.use_fp16_storage && elembits == 16) { if (opt.use_fp16_arithmetic) @@ -278,7 +277,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c #if __riscv_vector const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -389,25 +388,25 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c { for (int j = 0; j < outw; j++) { - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias_term) { - _sum = vle32_v_f32m1((const float*)bias_data + g * packn, vl); + _sum = __riscv_vle32_v_f32m1((const float*)bias_data + g * packn, vl); } const float* sptr = m.row(i * stride_h) + j * stride_w * packn; for (int k = 0; k < maxk; k++) { - vfloat32m1_t _val = vle32_v_f32m1(sptr + space_ofs[k] * packn, vl); - vfloat32m1_t _w = vle32_v_f32m1(kptr + k * packn, vl); - _sum = vfmacc_vv_f32m1(_sum, _val, _w, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(sptr + space_ofs[k] * packn, vl); + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr + k * packn, vl); + _sum = __riscv_vfmacc_vv_f32m1(_sum, _val, _w, vl); } _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse32_v_f32m1(outptr + j * packn, _sum, vl); + __riscv_vse32_v_f32m1(outptr + j * packn, _sum, vl); } outptr += outw * packn; @@ -573,7 +572,7 @@ int ConvolutionDepthWise_riscv::forward(const std::vector& bottom_blobs, st return -100; #if NCNN_RVV - if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && weight_data_flattened.elembits() == 16) + if (opt.use_fp16_storage && cpu_support_riscv_zvfh() && weight_data_flattened.elembits() == 16) { Mat weight_data_flattened_fp32; cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt); @@ -595,7 +594,7 @@ int ConvolutionDepthWise_riscv::forward(const std::vector& bottom_blobs, st return -100; #if NCNN_RVV - if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && bias_data_flattened.elembits() == 16) + if (opt.use_fp16_storage && cpu_support_riscv_zvfh() && bias_data_flattened.elembits() == 16) { Mat bias_data_flattened_fp32; cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt); @@ -650,511 +649,4 @@ int ConvolutionDepthWise_riscv::forward(const std::vector& bottom_blobs, st return 0; } -#if __riscv_vector && __riscv_zfh -int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) -{ - const int packn = csrr_vlenb() / 2; - - const int maxk = kernel_w * kernel_h; - int channels = (weight_data_size / group) / maxk / (num_output / group) * group; - - // depth-wise - if (channels == group && group == num_output) - { - int elempack = 1; - if (opt.use_packing_layout) - { - elempack = channels % packn == 0 ? 
packn : 1; - } - - // packn - if (elempack == packn) - { - Mat weight_data_r2 = weight_data.reshape(maxk, group); - Mat weight_data_r2_packed; - convert_packing(weight_data_r2, weight_data_r2_packed, packn, opt); - - ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt); - } - - if (elempack == 1) - { - ncnn::cast_float32_to_float16(weight_data, weight_data_tm, opt); - } - - ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); - - if (opt.lightmode) - weight_data.release(); - - return 0; - } - - // group convolution - create_group_ops(opt); - - if (opt.lightmode) - weight_data.release(); - - return 0; -} - -int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); - - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; - const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - - Mat bottom_blob_bordered; - make_padding(bottom_blob, bottom_blob_bordered, opt); - if (bottom_blob_bordered.empty()) - return -100; - - w = bottom_blob_bordered.w; - h = bottom_blob_bordered.h; - - int outw = (w - kernel_extent_w) / stride_w + 1; - int outh = (h - kernel_extent_h) / stride_h + 1; - int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; - - top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - // depth-wise - if (channels * elempack == group && group == num_output) - { - if (elempack == packn) - { - { - const int maxk = kernel_w * kernel_h; - - // kernel offsets - std::vector _space_ofs(maxk); - int* space_ofs = &_space_ofs[0]; - { - int p1 = 0; - int p2 = 0; - int gap = w * dilation_h - kernel_w * dilation_w; - for (int i = 0; i < kernel_h; i++) - { - for (int j = 0; j < kernel_w; j++) - { - space_ofs[p1] = p2; - p1++; - p2 += dilation_w; - } - p2 += gap; - } - } - - #pragma omp parallel for num_threads(opt.num_threads) - for (int g = 0; g < channels; g++) - { - __fp16* outptr = top_blob.channel(g); - const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn; - const Mat m = bottom_blob_bordered.channel(g); - - for (int i = 0; i < outh; i++) - { - for (int j = 0; j < outw; j++) - { - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); - - if (bias_term) - { - _sum = vle32_v_f32m2((const float*)bias_data + g * packn, vl); - } - - const __fp16* sptr = m.row(i * stride_h) + j * stride_w * packn; - - for (int k = 0; k < maxk; k++) - { - vfloat16m1_t _val = vle16_v_f16m1(sptr + space_ofs[k] * packn, vl); - vfloat16m1_t _w = vle16_v_f16m1(kptr + k * packn, vl); - _sum = vfwmacc_vv_f32m2(_sum, _val, _w, vl); - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - vse16_v_f16m1(outptr + j * packn, vfncvt_f_f_w_f16m1(_sum, vl), vl); - } - - outptr += outw * packn; - } - } - } - } - - if (elempack == 1) - { - { - const int maxk = kernel_w * kernel_h; - - // kernel offsets - std::vector _space_ofs(maxk); - int* space_ofs = &_space_ofs[0]; - { - int p1 = 0; - int p2 = 0; - int gap = w * dilation_h - kernel_w * dilation_w; - for (int i = 0; i < kernel_h; i++) - { - for (int j = 0; j < kernel_w; j++) - { - space_ofs[p1] = p2; - p1++; - p2 += dilation_w; - } - p2 += 
gap; - } - } - - #pragma omp parallel for num_threads(opt.num_threads) - for (int g = 0; g < group; g++) - { - __fp16* outptr = top_blob.channel(g); - const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g; - const Mat m = bottom_blob_bordered.channel(g); - - for (int i = 0; i < outh; i++) - { - for (int j = 0; j < outw; j++) - { - float sum = 0.f; - - if (bias_term) - sum = bias_data[g]; - - const __fp16* sptr = m.row(i * stride_h) + j * stride_w; - - for (int k = 0; k < maxk; k++) - { - float val = (float)sptr[space_ofs[k]]; - float w = (float)kptr[k]; - sum += val * w; - } - - sum = activation_ss(sum, activation_type, activation_params); - - outptr[j] = (__fp16)sum; - } - - outptr += outw; - } - } - } - } - - return 0; - } - - // group convolution - const int channels_g = channels * elempack / group; - const int num_output_g = num_output / group; - - int g_elempack = (opt.use_packing_layout && channels_g % packn == 0) ? packn : 1; - int out_g_elempack = (opt.use_packing_layout && num_output_g % packn == 0) ? packn : 1; - - // unpacking - Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; - if (elempack > g_elempack) - { - Option opt_p = opt; - opt_p.blob_allocator = opt.workspace_allocator; - convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p); - } - - Mat top_blob_unpacked = top_blob; - if (out_g_elempack < out_elempack) - { - top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator); - if (top_blob_unpacked.empty()) - return -100; - } - - for (int g = 0; g < group; g++) - { - const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); - Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); - - const ncnn::Layer* op = group_ops[g]; - - Option opt_g = opt; - opt_g.blob_allocator = top_blob_unpacked.allocator; - - // forward - op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); - } - - // packing - if (out_g_elempack < out_elempack) - { - convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); - } - else - { - top_blob = top_blob_unpacked; - } - - return 0; -} - -int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); - - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; - const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - - Mat bottom_blob_bordered; - make_padding(bottom_blob, bottom_blob_bordered, opt); - if (bottom_blob_bordered.empty()) - return -100; - - w = bottom_blob_bordered.w; - h = bottom_blob_bordered.h; - - int outw = (w - kernel_extent_w) / stride_w + 1; - int outh = (h - kernel_extent_h) / stride_h + 1; - int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? 
packn : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; - - top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - // depth-wise - if (channels * elempack == group && group == num_output) - { - if (elempack == packn) - { - if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - { - convdw3x3s1_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) - { - convdw3x3s2_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - { - convdw5x5s1_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) - { - convdw5x5s2_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else - { - const int maxk = kernel_w * kernel_h; - - // kernel offsets - std::vector _space_ofs(maxk); - int* space_ofs = &_space_ofs[0]; - { - int p1 = 0; - int p2 = 0; - int gap = w * dilation_h - kernel_w * dilation_w; - for (int i = 0; i < kernel_h; i++) - { - for (int j = 0; j < kernel_w; j++) - { - space_ofs[p1] = p2; - p1++; - p2 += dilation_w; - } - p2 += gap; - } - } - - #pragma omp parallel for num_threads(opt.num_threads) - for (int g = 0; g < channels; g++) - { - __fp16* outptr = top_blob.channel(g); - const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn; - const Mat m = bottom_blob_bordered.channel(g); - - for (int i = 0; i < outh; i++) - { - for (int j = 0; j < outw; j++) - { - vfloat16m1_t _sum = vfmv_v_f_f16m1((__fp16)0.f, vl); - - if (bias_term) - { - _sum = vle16_v_f16m1((const __fp16*)bias_data_fp16 + g * packn, vl); - } - - const __fp16* sptr = m.row(i * stride_h) + j * stride_w * packn; - - for (int k = 0; k < maxk; k++) - { - vfloat16m1_t _val = vle16_v_f16m1(sptr + space_ofs[k] * packn, vl); - vfloat16m1_t _w = vle16_v_f16m1(kptr + k * packn, vl); - _sum = vfmacc_vv_f16m1(_sum, _val, _w, vl); - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - vse16_v_f16m1(outptr + j * packn, _sum, vl); - } - - outptr += outw * packn; - } - } - } - } - - if (elempack == 1) - { - { - const int maxk = kernel_w * kernel_h; - - // kernel offsets - std::vector _space_ofs(maxk); - int* space_ofs = &_space_ofs[0]; - { - int p1 = 0; - int p2 = 0; - int gap = w * dilation_h - kernel_w * dilation_w; - for (int i = 0; i < kernel_h; i++) - { - for (int j = 0; j < kernel_w; j++) - { - space_ofs[p1] = p2; - p1++; - p2 += dilation_w; - } - p2 += gap; - } - } - - #pragma omp parallel for num_threads(opt.num_threads) - for (int g = 0; g < group; g++) - { - __fp16* outptr = top_blob.channel(g); - const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g; - const Mat m = bottom_blob_bordered.channel(g); - - for (int i = 0; i < outh; i++) - { - for (int j = 0; j < 
outw; j++) - { - float sum = 0.f; - - if (bias_term) - sum = bias_data[g]; - - const __fp16* sptr = m.row(i * stride_h) + j * stride_w; - - for (int k = 0; k < maxk; k++) - { - __fp16 val = sptr[space_ofs[k]]; - __fp16 w = kptr[k]; - sum += val * w; - } - - sum = activation_ss(sum, activation_type, activation_params); - - outptr[j] = (__fp16)sum; - } - - outptr += outw; - } - } - } - } - - return 0; - } - - // group convolution - const int channels_g = channels * elempack / group; - const int num_output_g = num_output / group; - - int g_elempack = (opt.use_packing_layout && channels_g % packn == 0) ? packn : 1; - int out_g_elempack = (opt.use_packing_layout && num_output_g % packn == 0) ? packn : 1; - - // unpacking - Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; - if (elempack > g_elempack) - { - Option opt_p = opt; - opt_p.blob_allocator = opt.workspace_allocator; - convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p); - } - - Mat top_blob_unpacked = top_blob; - if (out_g_elempack < out_elempack) - { - top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator); - if (top_blob_unpacked.empty()) - return -100; - } - - for (int g = 0; g < group; g++) - { - const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); - Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); - - const ncnn::Layer* op = group_ops[g]; - - Option opt_g = opt; - opt_g.blob_allocator = top_blob_unpacked.allocator; - - // forward - op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); - } - - // packing - if (out_g_elempack < out_elempack) - { - convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); - } - else - { - top_blob = top_blob_unpacked; - } - - return 0; -} -#endif // __riscv_vector && __riscv_zfh - } // namespace ncnn diff --git a/src/layer/riscv/convolutiondepthwise_riscv.h b/src/layer/riscv/convolutiondepthwise_riscv.h index f9503975296..7f2c66d8e73 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.h +++ b/src/layer/riscv/convolutiondepthwise_riscv.h @@ -33,7 +33,7 @@ class ConvolutionDepthWise_riscv : public ConvolutionDepthWise protected: int create_group_ops(const Option& opt); -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int create_pipeline_fp16s(const Option& opt); int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; diff --git a/src/layer/riscv/convolutiondepthwise_riscv_zfh.cpp b/src/layer/riscv/convolutiondepthwise_riscv_zfh.cpp new file mode 100644 index 00000000000..9764166ca56 --- /dev/null +++ b/src/layer/riscv/convolutiondepthwise_riscv_zfh.cpp @@ -0,0 +1,580 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "convolutiondepthwise_riscv.h"
+
+#if __riscv_vector
+#include <riscv_vector.h>
+#endif // __riscv_vector
+
+#include "riscv_activation.h"
+#include "riscv_usability.h"
+
+namespace ncnn {
+
+#if NCNN_ZFH
+#if __riscv_zvfh
+#include "convolutiondepthwise_3x3_packn_fp16s.h"
+#include "convolutiondepthwise_5x5_packn_fp16s.h"
+#endif
+#endif // NCNN_ZFH
+
+#if NCNN_ZFH
+int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
+{
+#if __riscv_zvfh
+    const int packn = csrr_vlenb() / 2;
+#endif // __riscv_zvfh
+
+    const int maxk = kernel_w * kernel_h;
+    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
+
+    // depth-wise
+    if (channels == group && group == num_output)
+    {
+        int elempack = 1;
+#if __riscv_zvfh
+        if (opt.use_packing_layout)
+        {
+            elempack = channels % packn == 0 ? packn : 1;
+        }
+#endif // __riscv_zvfh
+
+#if __riscv_zvfh
+        // packn
+        if (elempack == packn)
+        {
+            Mat weight_data_r2 = weight_data.reshape(maxk, group);
+            Mat weight_data_r2_packed;
+            convert_packing(weight_data_r2, weight_data_r2_packed, packn, opt);
+
+            ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt);
+        }
+#endif // __riscv_zvfh
+
+        if (elempack == 1)
+        {
+            ncnn::cast_float32_to_float16(weight_data, weight_data_tm, opt);
+        }
+
+        ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
+
+        if (opt.lightmode)
+            weight_data.release();
+
+        return 0;
+    }
+
+    // group convolution
+    create_group_ops(opt);
+
+    if (opt.lightmode)
+        weight_data.release();
+
+    return 0;
+}
+
+int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+#if __riscv_zvfh
+    const int packn = csrr_vlenb() / 2;
+    const size_t vl = __riscv_vsetvl_e16m1(packn);
+#endif // __riscv_zvfh
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
+    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
+
+    Mat bottom_blob_bordered;
+    make_padding(bottom_blob, bottom_blob_bordered, opt);
+    if (bottom_blob_bordered.empty())
+        return -100;
+
+    w = bottom_blob_bordered.w;
+    h = bottom_blob_bordered.h;
+
+    int outw = (w - kernel_extent_w) / stride_w + 1;
+    int outh = (h - kernel_extent_h) / stride_h + 1;
+    int out_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        out_elempack = num_output % packn == 0 ?
packn : 1;
+    }
+#endif // __riscv_zvfh
+    size_t out_elemsize = elemsize / elempack * out_elempack;
+
+    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    // depth-wise
+    if (channels * elempack == group && group == num_output)
+    {
+#if __riscv_zvfh
+        if (elempack == packn)
+        {
+            {
+                const int maxk = kernel_w * kernel_h;
+
+                // kernel offsets
+                std::vector<int> _space_ofs(maxk);
+                int* space_ofs = &_space_ofs[0];
+                {
+                    int p1 = 0;
+                    int p2 = 0;
+                    int gap = w * dilation_h - kernel_w * dilation_w;
+                    for (int i = 0; i < kernel_h; i++)
+                    {
+                        for (int j = 0; j < kernel_w; j++)
+                        {
+                            space_ofs[p1] = p2;
+                            p1++;
+                            p2 += dilation_w;
+                        }
+                        p2 += gap;
+                    }
+                }
+
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int g = 0; g < channels; g++)
+                {
+                    __fp16* outptr = top_blob.channel(g);
+                    const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn;
+                    const Mat m = bottom_blob_bordered.channel(g);
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        for (int j = 0; j < outw; j++)
+                        {
+                            vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl);
+
+                            if (bias_term)
+                            {
+                                _sum = __riscv_vle32_v_f32m2((const float*)bias_data + g * packn, vl);
+                            }
+
+                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * packn;
+
+                            for (int k = 0; k < maxk; k++)
+                            {
+                                vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr + space_ofs[k] * packn, vl);
+                                vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr + k * packn, vl);
+                                _sum = __riscv_vfwmacc_vv_f32m2(_sum, _val, _w, vl);
+                            }
+
+                            _sum = activation_ps(_sum, activation_type, activation_params, vl);
+
+                            __riscv_vse16_v_f16m1(outptr + j * packn, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl);
+                        }
+
+                        outptr += outw * packn;
+                    }
+                }
+            }
+        }
+#endif // __riscv_zvfh
+
+        if (elempack == 1)
+        {
+            {
+                const int maxk = kernel_w * kernel_h;
+
+                // kernel offsets
+                std::vector<int> _space_ofs(maxk);
+                int* space_ofs = &_space_ofs[0];
+                {
+                    int p1 = 0;
+                    int p2 = 0;
+                    int gap = w * dilation_h - kernel_w * dilation_w;
+                    for (int i = 0; i < kernel_h; i++)
+                    {
+                        for (int j = 0; j < kernel_w; j++)
+                        {
+                            space_ofs[p1] = p2;
+                            p1++;
+                            p2 += dilation_w;
+                        }
+                        p2 += gap;
+                    }
+                }
+
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int g = 0; g < group; g++)
+                {
+                    __fp16* outptr = top_blob.channel(g);
+                    const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g;
+                    const Mat m = bottom_blob_bordered.channel(g);
+
+                    for (int i = 0; i < outh; i++)
+                    {
+                        for (int j = 0; j < outw; j++)
+                        {
+                            float sum = 0.f;
+
+                            if (bias_term)
+                                sum = bias_data[g];
+
+                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;
+
+                            for (int k = 0; k < maxk; k++)
+                            {
+                                float val = (float)sptr[space_ofs[k]];
+                                float w = (float)kptr[k];
+                                sum += val * w;
+                            }
+
+                            sum = activation_ss(sum, activation_type, activation_params);
+
+                            outptr[j] = (__fp16)sum;
+                        }
+
+                        outptr += outw;
+                    }
+                }
+            }
+        }
+
+        return 0;
+    }
+
+    // group convolution
+    const int channels_g = channels * elempack / group;
+    const int num_output_g = num_output / group;
+
+    int g_elempack = 1;
+    int out_g_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        g_elempack = channels_g % packn == 0 ? packn : 1;
+        out_g_elempack = num_output_g % packn == 0 ?
packn : 1; + } +#endif // __riscv_zvfh + + // unpacking + Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p); + } + + Mat top_blob_unpacked = top_blob; + if (out_g_elempack < out_elempack) + { + top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_unpacked.allocator; + + // forward + op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + else + { + top_blob = top_blob_unpacked; + } + + return 0; +} + +int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; + const size_t vl = __riscv_vsetvl_e16m1(packn); +#endif // __riscv_zvfh + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? 
packn : 1; + } +#endif // __riscv_zvfh + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // depth-wise + if (channels * elempack == group && group == num_output) + { +#if __riscv_zvfh + if (elempack == packn) + { + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convdw3x3s1_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convdw3x3s2_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convdw5x5s1_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convdw5x5s2_packn_fp16sa_rvv(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + __fp16* outptr = top_blob.channel(g); + const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + + if (bias_term) + { + _sum = __riscv_vle16_v_f16m1((const __fp16*)bias_data_fp16 + g * packn, vl); + } + + const __fp16* sptr = m.row(i * stride_h) + j * stride_w * packn; + + for (int k = 0; k < maxk; k++) + { + vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr + space_ofs[k] * packn, vl); + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr + k * packn, vl); + _sum = __riscv_vfmacc_vv_f16m1(_sum, _val, _w, vl); + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr + j * packn, _sum, vl); + } + + outptr += outw * packn; + } + } + } + } +#endif // __riscv_zvfh + + if (elempack == 1) + { + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + __fp16* outptr = top_blob.channel(g); + const __fp16* kptr = (const __fp16*)weight_data_tm + maxk 
* g; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[g]; + + const __fp16* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + __fp16 val = sptr[space_ofs[k]]; + __fp16 w = kptr[k]; + sum += val * w; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = (__fp16)sum; + } + + outptr += outw; + } + } + } + } + + return 0; + } + + // group convolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + int g_elempack = 1; + int out_g_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + g_elempack = channels_g % packn == 0 ? packn : 1; + out_g_elempack = num_output_g % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh + + // unpacking + Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p); + } + + Mat top_blob_unpacked = top_blob; + if (out_g_elempack < out_elempack) + { + top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_unpacked.allocator; + + // forward + op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + else + { + top_blob = top_blob_unpacked; + } + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/crop_riscv.cpp b/src/layer/riscv/crop_riscv.cpp index 80e76fc47b4..95717c45900 100644 --- a/src/layer/riscv/crop_riscv.cpp +++ b/src/layer/riscv/crop_riscv.cpp @@ -20,16 +20,22 @@ #include "riscv_usability.h" +#include "cpu.h" + namespace ncnn { Crop_riscv::Crop_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif #if NCNN_BF16 support_bf16_storage = true; @@ -43,7 +49,7 @@ static void crop_packn_rvv(const Mat& src, Mat& dst, int top, int left, int pack int h = dst.h; int right = src.w - dst.w - left; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); const float* ptr = src.row(top) + left * packn; float* outptr = dst; @@ -52,8 +58,8 @@ static void crop_packn_rvv(const Mat& src, Mat& dst, int top, int left, int pack { for (int x = 0; x < w; x++) { - vfloat32m1_t _p = vle32_v_f32m1(ptr, vl); - vse32_v_f32m1(outptr, _p, vl); + vfloat32m1_t _p = __riscv_vle32_v_f32m1(ptr, vl); + __riscv_vse32_v_f32m1(outptr, _p, vl); ptr += packn; outptr += packn; @@ -69,7 +75,7 @@ static void crop_packn_bf16_fp16s_rvv(const Mat& src, Mat& dst, int top, int lef int h = 
dst.h; int right = src.w - dst.w - left; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); const unsigned short* ptr = src.row(top) + left * packn; unsigned short* outptr = dst; @@ -78,8 +84,8 @@ static void crop_packn_bf16_fp16s_rvv(const Mat& src, Mat& dst, int top, int lef { for (int x = 0; x < w; x++) { - vuint16m1_t _p = vle16_v_u16m1(ptr, vl); - vse16_v_u16m1(outptr, _p, vl); + vuint16m1_t _p = __riscv_vle16_v_u16m1(ptr, vl); + __riscv_vse16_v_u16m1(outptr, _p, vl); ptr += packn; outptr += packn; diff --git a/src/layer/riscv/deconvolution_pack1ton.h b/src/layer/riscv/deconvolution_pack1ton.h index ec18f62c1c6..d96f1991edd 100644 --- a/src/layer/riscv/deconvolution_pack1ton.h +++ b/src/layer/riscv/deconvolution_pack1ton.h @@ -15,7 +15,7 @@ static void deconvolution_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -42,11 +42,11 @@ static void deconvolution_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, co { for (int j = 0; j < outw; j++) { - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias_data_ptr) { - _sum = vle32_v_f32m1(bias_data_ptr + p * packn, vl); + _sum = __riscv_vle32_v_f32m1(bias_data_ptr + p * packn, vl); } const float* kptr = (const float*)weight_data_pack1ton + maxk * channels * p * packn; @@ -82,8 +82,8 @@ static void deconvolution_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, co int k = y * kernel_w + x; - vfloat32m1_t _w = vle32_v_f32m1(kptr + k * packn, vl); - _sum = vfmacc_vf_f32m1(_sum, val, _w, vl); + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr + k * packn, vl); + _sum = __riscv_vfmacc_vf_f32m1(_sum, val, _w, vl); } } @@ -92,7 +92,7 @@ static void deconvolution_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, co _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse32_v_f32m1(outptr + j * packn, _sum, vl); + __riscv_vse32_v_f32m1(outptr + j * packn, _sum, vl); } outptr += outw * packn; diff --git a/src/layer/riscv/deconvolution_pack1ton_fp16s.h b/src/layer/riscv/deconvolution_pack1ton_fp16s.h index 168c709217d..91ca4cf39f8 100644 --- a/src/layer/riscv/deconvolution_pack1ton_fp16s.h +++ b/src/layer/riscv/deconvolution_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void deconvolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -42,11 +42,11 @@ static void deconvolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl { for (int j = 0; j < outw; j++) { - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); if (bias_data_ptr) { - _sum = vle32_v_f32m2(bias_data_ptr + p * packn, vl); + _sum = __riscv_vle32_v_f32m2(bias_data_ptr + p * packn, vl); } const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * 
channels * p * packn; @@ -82,8 +82,8 @@ static void deconvolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl int k = y * kernel_w + x; - vfloat16m1_t _w = vle16_v_f16m1(kptr + k * packn, vl); - _sum = vfwmacc_vf_f32m2(_sum, val, _w, vl); + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr + k * packn, vl); + _sum = __riscv_vfwmacc_vf_f32m2(_sum, val, _w, vl); } } @@ -92,7 +92,7 @@ static void deconvolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse16_v_f16m1(outptr + j * packn, vfncvt_f_f_w_f16m1(_sum, vl), vl); + __riscv_vse16_v_f16m1(outptr + j * packn, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); } outptr += outw * packn; @@ -103,7 +103,7 @@ static void deconvolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl static void deconvolution_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -130,11 +130,11 @@ static void deconvolution_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_b { for (int j = 0; j < outw; j++) { - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); if (bias_data_ptr) { - _sum = vle16_v_f16m1(bias_data_ptr + p * packn, vl); + _sum = __riscv_vle16_v_f16m1(bias_data_ptr + p * packn, vl); } const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * channels * p * packn; @@ -170,8 +170,8 @@ static void deconvolution_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_b int k = y * kernel_w + x; - vfloat16m1_t _w = vle16_v_f16m1(kptr + k * packn, vl); - _sum = vfmacc_vf_f16m1(_sum, val, _w, vl); + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr + k * packn, vl); + _sum = __riscv_vfmacc_vf_f16m1(_sum, val, _w, vl); } } @@ -180,7 +180,7 @@ static void deconvolution_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_b _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse16_v_f16m1(outptr + j * packn, _sum, vl); + __riscv_vse16_v_f16m1(outptr + j * packn, _sum, vl); } outptr += outw * packn; diff --git a/src/layer/riscv/deconvolution_packn.h b/src/layer/riscv/deconvolution_packn.h index 8cab6c3b0a1..a02e58ab6ae 100644 --- a/src/layer/riscv/deconvolution_packn.h +++ b/src/layer/riscv/deconvolution_packn.h @@ -15,7 +15,7 @@ static void deconvolution_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packn, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -42,11 +42,11 @@ static void deconvolution_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const { for (int j = 0; j < outw; j++) { - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias_data_ptr) { - _sum = vle32_v_f32m1(bias_data_ptr + p * packn, vl); + _sum = __riscv_vle32_v_f32m1(bias_data_ptr + p * packn, vl); } const float* kptr = (const float*)weight_data_packn.channel(p); @@ 
-83,8 +83,8 @@ static void deconvolution_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const for (int l = 0; l < packn; l++) { float val = *sptr++; - vfloat32m1_t _w0 = vle32_v_f32m1(kptr + k * packn * packn + packn * l, vl); - _sum = vfmacc_vf_f32m1(_sum, val, _w0, vl); + vfloat32m1_t _w0 = __riscv_vle32_v_f32m1(kptr + k * packn * packn + packn * l, vl); + _sum = __riscv_vfmacc_vf_f32m1(_sum, val, _w0, vl); } } } @@ -94,7 +94,7 @@ static void deconvolution_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse32_v_f32m1(outptr + j * packn, _sum, vl); + __riscv_vse32_v_f32m1(outptr + j * packn, _sum, vl); } outptr += outw * packn; diff --git a/src/layer/riscv/deconvolution_packn_fp16s.h b/src/layer/riscv/deconvolution_packn_fp16s.h index 62fbd2eb731..b4bd5f80d26 100644 --- a/src/layer/riscv/deconvolution_packn_fp16s.h +++ b/src/layer/riscv/deconvolution_packn_fp16s.h @@ -15,7 +15,7 @@ static void deconvolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -42,11 +42,11 @@ static void deconvolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, { for (int j = 0; j < outw; j++) { - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); if (bias_data_ptr) { - _sum = vle32_v_f32m2(bias_data_ptr + p * packn, vl); + _sum = __riscv_vle32_v_f32m2(bias_data_ptr + p * packn, vl); } const __fp16* kptr = (const __fp16*)weight_data_fp16.channel(p); @@ -83,8 +83,8 @@ static void deconvolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, for (int l = 0; l < packn; l++) { __fp16 val = *sptr++; - vfloat16m1_t _w0 = vle16_v_f16m1(kptr + k * packn * packn + packn * l, vl); - _sum = vfwmacc_vf_f32m2(_sum, val, _w0, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr + k * packn * packn + packn * l, vl); + _sum = __riscv_vfwmacc_vf_f32m2(_sum, val, _w0, vl); } } } @@ -94,7 +94,7 @@ static void deconvolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse16_v_f16m1(outptr + j * packn, vfncvt_f_f_w_f16m1(_sum, vl), vl); + __riscv_vse16_v_f16m1(outptr + j * packn, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); } outptr += outw * packn; @@ -105,7 +105,7 @@ static void deconvolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, static void deconvolution_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -132,11 +132,11 @@ static void deconvolution_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob { for (int j = 0; j < outw; j++) { - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); if (bias_data_ptr) { - _sum = vle16_v_f16m1(bias_data_ptr + p * packn, vl); + _sum 
= __riscv_vle16_v_f16m1(bias_data_ptr + p * packn, vl); } const __fp16* kptr = (const __fp16*)weight_data_fp16.channel(p); @@ -173,8 +173,8 @@ static void deconvolution_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob for (int l = 0; l < packn; l++) { __fp16 val = *sptr++; - vfloat16m1_t _w0 = vle16_v_f16m1(kptr + k * packn * packn + packn * l, vl); - _sum = vfmacc_vf_f16m1(_sum, val, _w0, vl); + vfloat16m1_t _w0 = __riscv_vle16_v_f16m1(kptr + k * packn * packn + packn * l, vl); + _sum = __riscv_vfmacc_vf_f16m1(_sum, val, _w0, vl); } } } @@ -184,7 +184,7 @@ static void deconvolution_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse16_v_f16m1(outptr + j * packn, _sum, vl); + __riscv_vse16_v_f16m1(outptr + j * packn, _sum, vl); } outptr += outw * packn; diff --git a/src/layer/riscv/deconvolution_packnto1.h b/src/layer/riscv/deconvolution_packnto1.h index 2efa9b154d2..9fcffcae863 100644 --- a/src/layer/riscv/deconvolution_packnto1.h +++ b/src/layer/riscv/deconvolution_packnto1.h @@ -15,7 +15,7 @@ static void deconvolution_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packnto1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -49,7 +49,7 @@ static void deconvolution_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, co sum = bias_data_ptr[p]; } - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); const float* kptr = (const float*)weight_data_packnto1 + maxk * channels * p * packn; @@ -82,16 +82,16 @@ static void deconvolution_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, co int k = y * kernel_w + x; - vfloat32m1_t _val = vle32_v_f32m1(sptr, vl); - vfloat32m1_t _w = vle32_v_f32m1(kptr + k * packn, vl); - _sum = vfmacc_vv_f32m1(_sum, _val, _w, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(sptr, vl); + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr + k * packn, vl); + _sum = __riscv_vfmacc_vv_f32m1(_sum, _val, _w, vl); } } kptr += maxk * packn; } - sum = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m1_f32m1(vfloat32m1_t(), _sum, vfmv_s_f_f32m1(vfloat32m1_t(), sum, vl), vl)); + sum = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m1_f32m1(_sum, __riscv_vfmv_s_f_f32m1(sum, vl), vl)); sum = activation_ss(sum, activation_type, activation_params); diff --git a/src/layer/riscv/deconvolution_packnto1_fp16s.h b/src/layer/riscv/deconvolution_packnto1_fp16s.h index ab70100fb3b..59d1935cb15 100644 --- a/src/layer/riscv/deconvolution_packnto1_fp16s.h +++ b/src/layer/riscv/deconvolution_packnto1_fp16s.h @@ -15,7 +15,7 @@ static void deconvolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -49,7 +49,7 @@ static void deconvolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl sum = bias_data_ptr[p]; } - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, 
vl); + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * channels * p * packn; @@ -82,9 +82,9 @@ static void deconvolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl int k = y * kernel_w + x; - vfloat16m1_t _val = vle16_v_f16m1(sptr, vl); - vfloat16m1_t _w = vle16_v_f16m1(kptr + k * packn, vl); - _sum = vfwmacc_vv_f32m2(_sum, _val, _w, vl); + vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr, vl); + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr + k * packn, vl); + _sum = __riscv_vfwmacc_vv_f32m2(_sum, _val, _w, vl); } } @@ -94,13 +94,13 @@ static void deconvolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl #if C906 // TODO std::vector ss(packn); - vse32_v_f32m2((float*)ss.data(), _sum, vl); + __riscv_vse32_v_f32m2((float*)ss.data(), _sum, vl); for (int i = 0; i < packn; i++) { sum += ss[i]; } #else - sum = vfmv_f_s_f32m1_f32(vfredusum_vs_f32m2_f32m1(vfloat32m1_t(), _sum, vfmv_s_f_f32m1(vfloat32m1_t(), sum, vl), vl)); + sum = __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredusum_vs_f32m2_f32m1(_sum, __riscv_vfmv_s_f_f32m1(sum, vl), vl)); #endif sum = activation_ss(sum, activation_type, activation_params); @@ -116,7 +116,7 @@ static void deconvolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl static void deconvolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -150,7 +150,7 @@ static void deconvolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_b sum = bias_data_ptr[p]; } - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); const __fp16* kptr = (const __fp16*)weight_data_fp16 + maxk * channels * p * packn; @@ -183,16 +183,16 @@ static void deconvolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_b int k = y * kernel_w + x; - vfloat16m1_t _val = vle16_v_f16m1(sptr, vl); - vfloat16m1_t _w = vle16_v_f16m1(kptr + k * packn, vl); - _sum = vfmacc_vv_f16m1(_sum, _val, _w, vl); + vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr, vl); + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr + k * packn, vl); + _sum = __riscv_vfmacc_vv_f16m1(_sum, _val, _w, vl); } } kptr += maxk * packn; } - sum = vfmv_f_s_f16m1_f16(vfredusum_vs_f16m1_f16m1(vfloat16m1_t(), _sum, vfmv_s_f_f16m1(vfloat16m1_t(), sum, vl), vl)); + sum = __riscv_vfmv_f_s_f16m1_f16(__riscv_vfredusum_vs_f16m1_f16m1(_sum, __riscv_vfmv_s_f_f16m1(sum, vl), vl)); sum = activation_ss(sum, activation_type, activation_params); diff --git a/src/layer/riscv/deconvolution_riscv.cpp b/src/layer/riscv/deconvolution_riscv.cpp index 3b6364e8020..f76274470ab 100644 --- a/src/layer/riscv/deconvolution_riscv.cpp +++ b/src/layer/riscv/deconvolution_riscv.cpp @@ -14,9 +14,6 @@ #include "deconvolution_riscv.h" -#include "cpu.h" -#include "layer_type.h" - #if __riscv_vector #include #endif // __riscv_vector @@ -24,29 +21,29 @@ #include "riscv_activation.h" #include "riscv_usability.h" +#include "cpu.h" +#include "layer_type.h" + namespace ncnn { #if __riscv_vector #include "deconvolution_packn.h" #include "deconvolution_pack1ton.h" #include "deconvolution_packnto1.h" - -#if __riscv_zfh -#include "deconvolution_fp16s.h" 
-#include "deconvolution_packn_fp16s.h" -#include "deconvolution_pack1ton_fp16s.h" -#include "deconvolution_packnto1_fp16s.h" -#endif #endif // __riscv_vector Deconvolution_riscv::Deconvolution_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif } int Deconvolution_riscv::create_pipeline(const Option& opt) @@ -54,8 +51,8 @@ int Deconvolution_riscv::create_pipeline(const Option& opt) if (dynamic_weight) return 0; -#if __riscv_vector && __riscv_zfh - if (opt.use_fp16_storage) +#if NCNN_ZFH + if (support_fp16_storage && opt.use_fp16_storage) { return create_pipeline_fp16s(opt); } @@ -161,9 +158,9 @@ int Deconvolution_riscv::destroy_pipeline(const Option& opt) int Deconvolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { +#if NCNN_ZFH int elembits = bottom_blob.elembits(); -#if __riscv_vector && __riscv_zfh if (opt.use_fp16_storage && elembits == 16) { if (opt.use_fp16_arithmetic) @@ -337,7 +334,7 @@ int Deconvolution_riscv::forward(const std::vector& bottom_blobs, std::vect return -100; #if NCNN_RVV - if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && weight_data_flattened.elembits() == 16) + if (opt.use_fp16_storage && cpu_support_riscv_zvfh() && weight_data_flattened.elembits() == 16) { Mat weight_data_flattened_fp32; cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt); @@ -388,7 +385,7 @@ int Deconvolution_riscv::forward(const std::vector& bottom_blobs, std::vect return -100; #if NCNN_RVV - if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && bias_data_flattened.elembits() == 16) + if (opt.use_fp16_storage && cpu_support_riscv_zvfh() && bias_data_flattened.elembits() == 16) { Mat bias_data_flattened_fp32; cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt); @@ -444,240 +441,4 @@ int Deconvolution_riscv::forward(const std::vector& bottom_blobs, std::vect return 0; } -#if __riscv_vector && __riscv_zfh -int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt) -{ - const int packn = csrr_vlenb() / 2; - - const int maxk = kernel_w * kernel_h; - const int num_input = weight_data_size / maxk / num_output; - - int elempack = 1; - int out_elempack = 1; - - if (opt.use_packing_layout) - { - elempack = num_input % packn == 0 ? packn : 1; - out_elempack = num_output % packn == 0 ? 
packn : 1; - } - - Mat weight_data_transposed(weight_data.w); - { - float* pt = weight_data_transposed; - const float* p = weight_data; - - for (int i = 0; i < num_input * num_output; i++) - { - for (int k = 0; k < maxk; k++) - { - pt[maxk - 1 - k] = p[k]; - } - - p += maxk; - pt += maxk; - } - } - - // src = kw-kh-inch-outch - // dst = pb-pa-kw-kh-inch/pa-outch/pb - { - Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output); - - weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack); - - for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) - { - __fp16* g00 = weight_data_tm.channel(q / out_elempack); - - for (int p = 0; p + (elempack - 1) < num_input; p += elempack) - { - for (int k = 0; k < maxk; k++) - { - for (int i = 0; i < elempack; i++) - { - for (int j = 0; j < out_elempack; j++) - { - const float* k00 = weight_data_r2.channel(q + j).row(p + i); - - g00[0] = (__fp16)k00[k]; - - g00++; - } - } - } - } - } - } - - // packn - if (elempack == packn && out_elempack == packn) - { - } - - // pack1ton - if (elempack == 1 && out_elempack == packn) - { - } - - // packnto1 - if (elempack == packn && out_elempack == 1) - { - } - - // pack1 - if (elempack == 1 && out_elempack == 1) - { - } - - ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); - - if (opt.lightmode) - weight_data.release(); - - return 0; -} - -int Deconvolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - - // deconvolv with NxN kernel - // value = value + bias - - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - // NCNN_LOGE("Deconvolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h); - - const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; - const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - - int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; - int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; - int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? 
packn : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; - - Mat top_blob_bordered; - if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0)) - { - top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator); - } - else - { - top_blob_bordered = top_blob; - top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - } - if (top_blob_bordered.empty()) - return -100; - - if (elempack == packn && out_elempack == packn) - { - { - deconvolution_packn_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); - } - } - - if (elempack == 1 && out_elempack == packn) - { - { - deconvolution_pack1ton_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); - } - } - - if (elempack == packn && out_elempack == 1) - { - { - deconvolution_packnto1_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); - } - } - - if (elempack == 1 && out_elempack == 1) - { - { - deconvolution_fp16s(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); - } - } - - cut_padding(top_blob_bordered, top_blob, opt); - if (top_blob.empty()) - return -100; - - return 0; -} - -int Deconvolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - - // deconvolv with NxN kernel - // value = value + bias - - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - // NCNN_LOGE("Deconvolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h); - - const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; - const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - - int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; - int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; - int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? 
packn : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; - - Mat top_blob_bordered; - if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0)) - { - top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator); - } - else - { - top_blob_bordered = top_blob; - top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - } - if (top_blob_bordered.empty()) - return -100; - - if (elempack == packn && out_elempack == packn) - { - { - deconvolution_packn_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); - } - } - - if (elempack == 1 && out_elempack == packn) - { - { - deconvolution_pack1ton_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); - } - } - - if (elempack == packn && out_elempack == 1) - { - { - deconvolution_packnto1_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); - } - } - - if (elempack == 1 && out_elempack == 1) - { - { - deconvolution_fp16s(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); - } - } - - cut_padding(top_blob_bordered, top_blob, opt); - if (top_blob.empty()) - return -100; - - return 0; -} -#endif // __riscv_vector && __riscv_zfh - } // namespace ncnn diff --git a/src/layer/riscv/deconvolution_riscv.h b/src/layer/riscv/deconvolution_riscv.h index 57d30349aad..96233bd79dc 100644 --- a/src/layer/riscv/deconvolution_riscv.h +++ b/src/layer/riscv/deconvolution_riscv.h @@ -32,7 +32,7 @@ class Deconvolution_riscv : public Deconvolution virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int create_pipeline_fp16s(const Option& opt); int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; diff --git a/src/layer/riscv/deconvolution_riscv_zfh.cpp b/src/layer/riscv/deconvolution_riscv_zfh.cpp new file mode 100644 index 00000000000..80d59a93ce4 --- /dev/null +++ b/src/layer/riscv/deconvolution_riscv_zfh.cpp @@ -0,0 +1,296 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+
+#include "deconvolution_riscv.h"
+
+#if __riscv_vector
+#include <riscv_vector.h>
+#endif // __riscv_vector
+
+#include "riscv_activation.h"
+#include "riscv_usability.h"
+
+namespace ncnn {
+
+#if NCNN_ZFH
+#include "deconvolution_fp16s.h"
+#if __riscv_zvfh
+#include "deconvolution_packn_fp16s.h"
+#include "deconvolution_pack1ton_fp16s.h"
+#include "deconvolution_packnto1_fp16s.h"
+#endif
+#endif // NCNN_ZFH
+
+#if NCNN_ZFH
+int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt)
+{
+#if __riscv_zvfh
+    const int packn = csrr_vlenb() / 2;
+#endif // __riscv_zvfh
+
+    const int maxk = kernel_w * kernel_h;
+    const int num_input = weight_data_size / maxk / num_output;
+
+    int elempack = 1;
+    int out_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        elempack = num_input % packn == 0 ? packn : 1;
+        out_elempack = num_output % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_zvfh
+
+    Mat weight_data_transposed(weight_data.w);
+    {
+        float* pt = weight_data_transposed;
+        const float* p = weight_data;
+
+        for (int i = 0; i < num_input * num_output; i++)
+        {
+            for (int k = 0; k < maxk; k++)
+            {
+                pt[maxk - 1 - k] = p[k];
+            }
+
+            p += maxk;
+            pt += maxk;
+        }
+    }
+
+    // src = kw-kh-inch-outch
+    // dst = pb-pa-kw-kh-inch/pa-outch/pb
+    {
+        Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);
+
+        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack);
+
+        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
+        {
+            __fp16* g00 = weight_data_tm.channel(q / out_elempack);
+
+            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
+            {
+                for (int k = 0; k < maxk; k++)
+                {
+                    for (int i = 0; i < elempack; i++)
+                    {
+                        for (int j = 0; j < out_elempack; j++)
+                        {
+                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);
+
+                            g00[0] = (__fp16)k00[k];
+
+                            g00++;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+#if __riscv_zvfh
+    // packn
+    if (elempack == packn && out_elempack == packn)
+    {
+    }
+
+    // pack1ton
+    if (elempack == 1 && out_elempack == packn)
+    {
+    }
+
+    // packnto1
+    if (elempack == packn && out_elempack == 1)
+    {
+    }
+#endif // __riscv_zvfh
+
+    // pack1
+    if (elempack == 1 && out_elempack == 1)
+    {
+    }
+
+    ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
+
+    if (opt.lightmode)
+        weight_data.release();
+
+    return 0;
+}
+
+int Deconvolution_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+#if __riscv_zvfh
+    const int packn = csrr_vlenb() / 2;
+#endif // __riscv_zvfh
+
+    // deconvolv with NxN kernel
+    // value = value + bias
+
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob.elempack;
+
+    // NCNN_LOGE("Deconvolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);
+
+    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
+    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
+
+    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
+    int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
+    int out_elempack = 1;
+#if __riscv_zvfh
+    if (opt.use_packing_layout)
+    {
+        out_elempack = num_output % packn == 0 ?
packn : 1; + } +#endif // __riscv_zvfh + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat top_blob_bordered; + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0)) + { + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator); + } + else + { + top_blob_bordered = top_blob; + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + } + if (top_blob_bordered.empty()) + return -100; + +#if __riscv_zvfh + if (elempack == packn && out_elempack == packn) + { + { + deconvolution_packn_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == packn) + { + { + deconvolution_pack1ton_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == packn && out_elempack == 1) + { + { + deconvolution_packnto1_fp16s_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } +#endif // __riscv_zvfh + + if (elempack == 1 && out_elempack == 1) + { + { + deconvolution_fp16s(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + cut_padding(top_blob_bordered, top_blob, opt); + if (top_blob.empty()) + return -100; + + return 0; +} + +int Deconvolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh + + // deconvolv with NxN kernel + // value = value + bias + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + // NCNN_LOGE("Deconvolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; + int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? 
packn : 1; + } +#endif // __riscv_zvfh + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat top_blob_bordered; + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0)) + { + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator); + } + else + { + top_blob_bordered = top_blob; + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + } + if (top_blob_bordered.empty()) + return -100; + +#if __riscv_zvfh + if (elempack == packn && out_elempack == packn) + { + { + deconvolution_packn_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == packn) + { + { + deconvolution_pack1ton_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == packn && out_elempack == 1) + { + { + deconvolution_packnto1_fp16sa_rvv(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } +#endif // __riscv_zvfh + + if (elempack == 1 && out_elempack == 1) + { + { + deconvolution_fp16s(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + cut_padding(top_blob_bordered, top_blob, opt); + if (top_blob.empty()) + return -100; + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp index 6a311680f4f..ab22becae09 100644 --- a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp @@ -14,9 +14,6 @@ #include "deconvolutiondepthwise_riscv.h" -#include "cpu.h" -#include "layer_type.h" - #if __riscv_vector #include #endif // __riscv_vector @@ -24,16 +21,23 @@ #include "riscv_activation.h" #include "riscv_usability.h" +#include "cpu.h" +#include "layer_type.h" + namespace ncnn { DeconvolutionDepthWise_riscv::DeconvolutionDepthWise_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif } int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt) @@ -41,8 +45,8 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt) if (dynamic_weight) return 0; -#if __riscv_vector && __riscv_zfh - if (opt.use_fp16_storage) +#if NCNN_ZFH + if (support_fp16_storage && opt.use_fp16_storage) { return create_pipeline_fp16s(opt); } @@ -196,9 +200,9 @@ int DeconvolutionDepthWise_riscv::destroy_pipeline(const Option& opt) int DeconvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { +#if NCNN_ZFH int elembits = bottom_blob.elembits(); -#if __riscv_vector && __riscv_zfh if (opt.use_fp16_storage && elembits == 16) { if (opt.use_fp16_arithmetic) @@ -210,7 +214,7 @@ int DeconvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, #if 
__riscv_vector const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); #endif // convolv with NxN kernel @@ -269,11 +273,11 @@ int DeconvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, { for (int j = 0; j < outw; j++) { - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias_term) { - _sum = vle32_v_f32m1((const float*)bias_data + g * packn, vl); + _sum = __riscv_vle32_v_f32m1((const float*)bias_data + g * packn, vl); } for (int y = 0; y < kernel_h; y++) @@ -300,15 +304,15 @@ int DeconvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, int k = y * kernel_w + x; - vfloat32m1_t _val = vle32_v_f32m1(sptr, vl); - vfloat32m1_t _w = vle32_v_f32m1(kptr + k * packn, vl); - _sum = vfmacc_vv_f32m1(_sum, _val, _w, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(sptr, vl); + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr + k * packn, vl); + _sum = __riscv_vfmacc_vv_f32m1(_sum, _val, _w, vl); } } _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse32_v_f32m1(outptr + j * packn, _sum, vl); + __riscv_vse32_v_f32m1(outptr + j * packn, _sum, vl); } outptr += outw * packn; @@ -462,7 +466,7 @@ int DeconvolutionDepthWise_riscv::forward(const std::vector& bottom_blobs, return -100; #if NCNN_RVV - if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && weight_data_flattened.elembits() == 16) + if (opt.use_fp16_storage && cpu_support_riscv_zvfh() && weight_data_flattened.elembits() == 16) { Mat weight_data_flattened_fp32; cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt); @@ -513,7 +517,7 @@ int DeconvolutionDepthWise_riscv::forward(const std::vector& bottom_blobs, return -100; #if NCNN_RVV - if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && bias_data_flattened.elembits() == 16) + if (opt.use_fp16_storage && cpu_support_riscv_zvfh() && bias_data_flattened.elembits() == 16) { Mat bias_data_flattened_fp32; cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt); @@ -570,513 +574,4 @@ int DeconvolutionDepthWise_riscv::forward(const std::vector& bottom_blobs, return 0; } -#if __riscv_vector && __riscv_zfh -int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) -{ - const int packn = csrr_vlenb() / 2; - - const int maxk = kernel_w * kernel_h; - int channels = (weight_data_size / group) / maxk / (num_output / group) * group; - - // depth-wise - if (channels == group && group == num_output) - { - int elempack = 1; - if (opt.use_packing_layout) - { - elempack = channels % packn == 0 ? 
packn : 1; - } - - Mat weight_data_transposed(weight_data.w); - { - float* pt = weight_data_transposed; - const float* p = weight_data; - - for (int i = 0; i < (channels / group) * (num_output / group) * group; i++) - { - for (int k = 0; k < maxk; k++) - { - pt[maxk - 1 - k] = p[k]; - } - - p += maxk; - pt += maxk; - } - } - - // packn - if (elempack == packn) - { - Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group); - Mat weight_data_r2_packed; - convert_packing(weight_data_r2, weight_data_r2_packed, packn, opt); - - ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt); - } - - if (elempack == 1) - { - ncnn::cast_float32_to_float16(weight_data_transposed, weight_data_tm, opt); - } - - ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); - - if (opt.lightmode) - weight_data.release(); - - return 0; - } - - // group convolution - create_group_ops(opt); - - if (opt.lightmode) - weight_data.release(); - - return 0; -} - -int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); - - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; - const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - - int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; - int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; - int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; - - Mat top_blob_bordered; - if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0)) - { - top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator); - } - else - { - top_blob_bordered = top_blob; - top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - } - if (top_blob_bordered.empty()) - return -100; - - const int maxk = kernel_w * kernel_h; - - // depth-wise - if (channels * elempack == group && group == num_output) - { - if (elempack == packn) - { - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int g = 0; g < channels; g++) - { - __fp16* outptr = top_blob_bordered.channel(g); - const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn; - const Mat m = bottom_blob.channel(g); - - for (int i = 0; i < outh; i++) - { - for (int j = 0; j < outw; j++) - { - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); - - if (bias_term) - { - _sum = vle32_v_f32m2((const float*)bias_data + g * packn, vl); - } - - for (int y = 0; y < kernel_h; y++) - { - int sys = (i + y * dilation_h - (kernel_extent_h - 1)); - if (sys < 0 || sys % stride_h != 0) - continue; - - int sy = sys / stride_h; - if (sy >= h) - continue; - - for (int x = 0; x < kernel_w; x++) - { - int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); - if (sxs < 0 || sxs % stride_w != 0) - continue; - - int sx = sxs / stride_w; - if (sx >= w) - continue; - - const __fp16* sptr = m.row(sy) + sx * packn; - - int k = y * kernel_w + x; - - vfloat16m1_t _val = vle16_v_f16m1(sptr, vl); - vfloat16m1_t _w = vle16_v_f16m1(kptr + k * packn, vl); - _sum = vfwmacc_vv_f32m2(_sum, _val, _w, vl); - } - } - - _sum = activation_ps(_sum, activation_type, 
activation_params, vl); - - vse16_v_f16m1(outptr + j * packn, vfncvt_f_f_w_f16m1(_sum, vl), vl); - } - - outptr += outw * packn; - } - } - } - } - - if (elempack == 1) - { - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int g = 0; g < channels; g++) - { - __fp16* outptr = top_blob_bordered.channel(g); - const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g; - const Mat m = bottom_blob.channel(g); - - for (int i = 0; i < outh; i++) - { - for (int j = 0; j < outw; j++) - { - float sum = 0.f; - - if (bias_term) - { - sum = bias_data[g]; - } - - for (int y = 0; y < kernel_h; y++) - { - int sys = (i + y * dilation_h - (kernel_extent_h - 1)); - if (sys < 0 || sys % stride_h != 0) - continue; - - int sy = sys / stride_h; - if (sy >= h) - continue; - - const __fp16* sptr = m.row(sy); - - for (int x = 0; x < kernel_w; x++) - { - int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); - if (sxs < 0 || sxs % stride_w != 0) - continue; - - int sx = sxs / stride_w; - if (sx >= w) - continue; - - float val = (float)sptr[sx]; - - int k = y * kernel_w + x; - - float w = (float)kptr[k]; - - sum += val * w; - } - } - - sum = activation_ss(sum, activation_type, activation_params); - - outptr[j] = (__fp16)sum; - } - - outptr += outw; - } - } - } - } - } - else - { - // group deconvolution - const int channels_g = channels * elempack / group; - const int num_output_g = num_output / group; - - int g_elempack = (opt.use_packing_layout && channels_g % packn == 0) ? packn : 1; - int out_g_elempack = (opt.use_packing_layout && num_output_g % packn == 0) ? packn : 1; - - // unpacking - Mat bottom_blob_unpacked = bottom_blob; - if (elempack > g_elempack) - { - Option opt_p = opt; - opt_p.blob_allocator = opt.workspace_allocator; - convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p); - } - - Mat top_blob_bordered_unpacked = top_blob_bordered; - if (out_g_elempack < out_elempack) - { - top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator); - if (top_blob_bordered_unpacked.empty()) - return -100; - } - - for (int g = 0; g < group; g++) - { - const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); - Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); - - const ncnn::Layer* op = group_ops[g]; - - Option opt_g = opt; - opt_g.blob_allocator = top_blob_bordered_unpacked.allocator; - - // forward - op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); - } - - // packing - if (out_g_elempack < out_elempack) - { - convert_packing(top_blob_bordered_unpacked, top_blob_bordered, out_elempack, opt); - } - else - { - top_blob_bordered = top_blob_bordered_unpacked; - } - } - - cut_padding(top_blob_bordered, top_blob, opt); - if (top_blob.empty()) - return -100; - - return 0; -} - -int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); - - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; - const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - - int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; - int outh = (h - 1) * stride_h + kernel_extent_h + 
output_pad_bottom; - int out_elempack = (opt.use_packing_layout && num_output % packn == 0) ? packn : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; - - Mat top_blob_bordered; - if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0)) - { - top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator); - } - else - { - top_blob_bordered = top_blob; - top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - } - if (top_blob_bordered.empty()) - return -100; - - const int maxk = kernel_w * kernel_h; - - // depth-wise - if (channels * elempack == group && group == num_output) - { - if (elempack == packn) - { - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int g = 0; g < channels; g++) - { - __fp16* outptr = top_blob_bordered.channel(g); - const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn; - const Mat m = bottom_blob.channel(g); - - for (int i = 0; i < outh; i++) - { - for (int j = 0; j < outw; j++) - { - vfloat16m1_t _sum = vfmv_v_f_f16m1((__fp16)0.f, vl); - - if (bias_term) - { - _sum = vle16_v_f16m1((const __fp16*)bias_data_fp16 + g * packn, vl); - } - - for (int y = 0; y < kernel_h; y++) - { - int sys = (i + y * dilation_h - (kernel_extent_h - 1)); - if (sys < 0 || sys % stride_h != 0) - continue; - - int sy = sys / stride_h; - if (sy >= h) - continue; - - for (int x = 0; x < kernel_w; x++) - { - int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); - if (sxs < 0 || sxs % stride_w != 0) - continue; - - int sx = sxs / stride_w; - if (sx >= w) - continue; - - const __fp16* sptr = m.row(sy) + sx * packn; - - int k = y * kernel_w + x; - - vfloat16m1_t _val = vle16_v_f16m1(sptr, vl); - vfloat16m1_t _w = vle16_v_f16m1(kptr + k * packn, vl); - _sum = vfmacc_vv_f16m1(_sum, _val, _w, vl); - } - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - vse16_v_f16m1(outptr + j * packn, _sum, vl); - } - - outptr += outw * packn; - } - } - } - } - - if (elempack == 1) - { - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int g = 0; g < channels; g++) - { - __fp16* outptr = top_blob_bordered.channel(g); - const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g; - const Mat m = bottom_blob.channel(g); - - for (int i = 0; i < outh; i++) - { - for (int j = 0; j < outw; j++) - { - float sum = 0.f; - - if (bias_term) - { - sum = bias_data[g]; - } - - for (int y = 0; y < kernel_h; y++) - { - int sys = (i + y * dilation_h - (kernel_extent_h - 1)); - if (sys < 0 || sys % stride_h != 0) - continue; - - int sy = sys / stride_h; - if (sy >= h) - continue; - - const __fp16* sptr = m.row(sy); - - for (int x = 0; x < kernel_w; x++) - { - int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); - if (sxs < 0 || sxs % stride_w != 0) - continue; - - int sx = sxs / stride_w; - if (sx >= w) - continue; - - __fp16 val = sptr[sx]; - - int k = y * kernel_w + x; - - __fp16 w = kptr[k]; - - sum += val * w; - } - } - - sum = activation_ss(sum, activation_type, activation_params); - - outptr[j] = (__fp16)sum; - } - - outptr += outw; - } - } - } - } - } - else - { - // group deconvolution - const int channels_g = channels * elempack / group; - const int num_output_g = num_output / group; - - int g_elempack = (opt.use_packing_layout && channels_g % packn == 0) ? packn : 1; - int out_g_elempack = (opt.use_packing_layout && num_output_g % packn == 0) ? 
packn : 1; - - // unpacking - Mat bottom_blob_unpacked = bottom_blob; - if (elempack > g_elempack) - { - Option opt_p = opt; - opt_p.blob_allocator = opt.workspace_allocator; - convert_packing(bottom_blob, bottom_blob_unpacked, g_elempack, opt_p); - } - - Mat top_blob_bordered_unpacked = top_blob_bordered; - if (out_g_elempack < out_elempack) - { - top_blob_bordered_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator); - if (top_blob_bordered_unpacked.empty()) - return -100; - } - - for (int g = 0; g < group; g++) - { - const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); - Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); - - const ncnn::Layer* op = group_ops[g]; - - Option opt_g = opt; - opt_g.blob_allocator = top_blob_bordered_unpacked.allocator; - - // forward - op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); - } - - // packing - if (out_g_elempack < out_elempack) - { - convert_packing(top_blob_bordered_unpacked, top_blob_bordered, out_elempack, opt); - } - else - { - top_blob_bordered = top_blob_bordered_unpacked; - } - } - - cut_padding(top_blob_bordered, top_blob, opt); - if (top_blob.empty()) - return -100; - - return 0; -} -#endif // __riscv_vector && __riscv_zfh - } // namespace ncnn diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.h b/src/layer/riscv/deconvolutiondepthwise_riscv.h index b0c8f7b0119..a965b5a4699 100644 --- a/src/layer/riscv/deconvolutiondepthwise_riscv.h +++ b/src/layer/riscv/deconvolutiondepthwise_riscv.h @@ -33,7 +33,7 @@ class DeconvolutionDepthWise_riscv : public DeconvolutionDepthWise protected: int create_group_ops(const Option& opt); -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int create_pipeline_fp16s(const Option& opt); int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv_zfh.cpp b/src/layer/riscv/deconvolutiondepthwise_riscv_zfh.cpp new file mode 100644 index 00000000000..69f9d5b4cdc --- /dev/null +++ b/src/layer/riscv/deconvolutiondepthwise_riscv_zfh.cpp @@ -0,0 +1,575 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "deconvolutiondepthwise_riscv.h" + +#if __riscv_vector +#include <riscv_vector.h> +#endif // __riscv_vector + +#include "riscv_activation.h" +#include "riscv_usability.h" + +namespace ncnn { + +#if NCNN_ZFH +int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) +{ +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh + + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + // depth-wise + if (channels == group && group == num_output) + { + int elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + elempack = channels % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh + + Mat weight_data_transposed(weight_data.w); + { + float* pt = weight_data_transposed; + const float* p = weight_data; + + for (int i = 0; i < (channels / group) * (num_output / group) * group; i++) + { + for (int k = 0; k < maxk; k++) + { + pt[maxk - 1 - k] = p[k]; + } + + p += maxk; + pt += maxk; + } + } + +#if __riscv_zvfh + // packn + if (elempack == packn) + { + Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group); + Mat weight_data_r2_packed; + convert_packing(weight_data_r2, weight_data_r2_packed, packn, opt); + + ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt); + } +#endif // __riscv_zvfh + + if (elempack == 1) + { + ncnn::cast_float32_to_float16(weight_data_transposed, weight_data_tm, opt); + } + + ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); + + if (opt.lightmode) + weight_data.release(); + + return 0; + } + + // group convolution + create_group_ops(opt); + + if (opt.lightmode) + weight_data.release(); + + return 0; +} + +int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; + const size_t vl = __riscv_vsetvl_e16m1(packn); +#endif // __riscv_zvfh + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; + int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ?
packn : 1; + } +#endif // __riscv_zvfh + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat top_blob_bordered; + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0)) + { + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator); + } + else + { + top_blob_bordered = top_blob; + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + } + if (top_blob_bordered.empty()) + return -100; + + const int maxk = kernel_w * kernel_h; + + // depth-wise + if (channels * elempack == group && group == num_output) + { +#if __riscv_zvfh + if (elempack == packn) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + __fp16* outptr = top_blob_bordered.channel(g); + const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn; + const Mat m = bottom_blob.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vle32_v_f32m2((const float*)bias_data + g * packn, vl); + } + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + const __fp16* sptr = m.row(sy) + sx * packn; + + int k = y * kernel_w + x; + + vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr, vl); + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr + k * packn, vl); + _sum = __riscv_vfwmacc_vv_f32m2(_sum, _val, _w, vl); + } + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr + j * packn, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); + } + + outptr += outw * packn; + } + } + } + } +#endif // __riscv_zvfh + + if (elempack == 1) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + __fp16* outptr = top_blob_bordered.channel(g); + const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g; + const Mat m = bottom_blob.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[g]; + } + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + const __fp16* sptr = m.row(sy); + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + float val = (float)sptr[sx]; + + int k = y * kernel_w + x; + + float w = (float)kptr[k]; + + sum += val * w; + } + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = (__fp16)sum; + } + + outptr += outw; + } + } + } + } + } + else + { + // group deconvolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + int g_elempack = 1; + int out_g_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + g_elempack = channels_g % packn == 
0 ? packn : 1; + out_g_elempack = num_output_g % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh + + // unpacking + Mat bottom_blob_unpacked = bottom_blob; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p); + } + + Mat top_blob_bordered_unpacked = top_blob_bordered; + if (out_g_elempack < out_elempack) + { + top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator); + if (top_blob_bordered_unpacked.empty()) + return -100; + } + + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_bordered_unpacked.allocator; + + // forward + op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_bordered_unpacked, top_blob_bordered, out_elempack, opt); + } + else + { + top_blob_bordered = top_blob_bordered_unpacked; + } + } + + cut_padding(top_blob_bordered, top_blob, opt); + if (top_blob.empty()) + return -100; + + return 0; +} + +int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; + const size_t vl = __riscv_vsetvl_e16m1(packn); +#endif // __riscv_zvfh + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; + int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? 
packn : 1; + } +#endif // __riscv_zvfh + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat top_blob_bordered; + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0)) + { + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator); + } + else + { + top_blob_bordered = top_blob; + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + } + if (top_blob_bordered.empty()) + return -100; + + const int maxk = kernel_w * kernel_h; + + // depth-wise + if (channels * elempack == group && group == num_output) + { +#if __riscv_zvfh + if (elempack == packn) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + __fp16* outptr = top_blob_bordered.channel(g); + const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * packn; + const Mat m = bottom_blob.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + + if (bias_term) + { + _sum = __riscv_vle16_v_f16m1((const __fp16*)bias_data_fp16 + g * packn, vl); + } + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + const __fp16* sptr = m.row(sy) + sx * packn; + + int k = y * kernel_w + x; + + vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr, vl); + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr + k * packn, vl); + _sum = __riscv_vfmacc_vv_f16m1(_sum, _val, _w, vl); + } + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr + j * packn, _sum, vl); + } + + outptr += outw * packn; + } + } + } + } +#endif // __riscv_zvfh + + if (elempack == 1) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + __fp16* outptr = top_blob_bordered.channel(g); + const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g; + const Mat m = bottom_blob.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[g]; + } + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + const __fp16* sptr = m.row(sy); + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + __fp16 val = sptr[sx]; + + int k = y * kernel_w + x; + + __fp16 w = kptr[k]; + + sum += val * w; + } + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = (__fp16)sum; + } + + outptr += outw; + } + } + } + } + } + else + { + // group deconvolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + int g_elempack = 1; + int out_g_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + g_elempack = channels_g % packn == 0 ? 
packn : 1; + out_g_elempack = num_output_g % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh + + // unpacking + Mat bottom_blob_unpacked = bottom_blob; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob, bottom_blob_unpacked, g_elempack, opt_p); + } + + Mat top_blob_bordered_unpacked = top_blob_bordered; + if (out_g_elempack < out_elempack) + { + top_blob_bordered_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator); + if (top_blob_bordered_unpacked.empty()) + return -100; + } + + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_bordered_unpacked.allocator; + + // forward + op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_bordered_unpacked, top_blob_bordered, out_elempack, opt); + } + else + { + top_blob_bordered = top_blob_bordered_unpacked; + } + } + + cut_padding(top_blob_bordered, top_blob, opt); + if (top_blob.empty()) + return -100; + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/dropout_riscv.cpp b/src/layer/riscv/dropout_riscv.cpp index 461edf2d056..4a1ddc0cfd0 100644 --- a/src/layer/riscv/dropout_riscv.cpp +++ b/src/layer/riscv/dropout_riscv.cpp @@ -53,11 +53,11 @@ int Dropout_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size; while (n > 0) { - size_t vl = vsetvl_e32m8(n); + size_t vl = __riscv_vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - _p = vfmul_vf_f32m8(_p, scale, vl); - vse32_v_f32m8(ptr, _p, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + _p = __riscv_vfmul_vf_f32m8(_p, scale, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; n -= vl; diff --git a/src/layer/riscv/flatten_riscv.cpp b/src/layer/riscv/flatten_riscv.cpp index 491c051c7fe..baf4e11fe8a 100644 --- a/src/layer/riscv/flatten_riscv.cpp +++ b/src/layer/riscv/flatten_riscv.cpp @@ -20,16 +20,22 @@ #include "riscv_usability.h" +#include "cpu.h" + namespace ncnn { Flatten_riscv::Flatten_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif #if NCNN_BF16 support_bf16_storage = true; @@ -43,7 +49,7 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (elembits == 8) return forward_int8(bottom_blob, top_blob, opt); -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH if (opt.use_fp16_storage && elembits == 16) return forward_bf16s_fp16s(bottom_blob, top_blob, opt); #endif @@ -119,10 +125,10 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w * elempack; while (n > 0) { - size_t vl = vsetvl_e32m1(n); + size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t _p = vle32_v_f32m1(ptr, vl); - vsse32_v_f32m1(outptr, w * sizeof(float), _p, vl); + vfloat32m1_t _p = __riscv_vle32_v_f32m1(ptr, vl); + __riscv_vsse32_v_f32m1(outptr, 
w * sizeof(float), _p, vl); ptr += vl; outptr += 1; @@ -147,10 +153,10 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size * elempack; while (n > 0) { - size_t vl = vsetvl_e32m1(n); + size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t _p = vle32_v_f32m1(ptr, vl); - vsse32_v_f32m1(outptr, size * sizeof(float), _p, vl); + vfloat32m1_t _p = __riscv_vle32_v_f32m1(ptr, vl); + __riscv_vsse32_v_f32m1(outptr, size * sizeof(float), _p, vl); ptr += vl; outptr += 1; @@ -172,10 +178,10 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size * elempack; while (n > 0) { - size_t vl = vsetvl_e32m8(n); + size_t vl = __riscv_vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - vse32_v_f32m8(outptr, _p, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + __riscv_vse32_v_f32m8(outptr, _p, vl); ptr += vl; outptr += vl; @@ -262,10 +268,10 @@ int Flatten_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w * elempack; while (n > 0) { - size_t vl = vsetvl_e16m1(n); + size_t vl = __riscv_vsetvl_e16m1(n); - vuint16m1_t _p = vle16_v_u16m1(ptr, vl); - vsse16_v_u16m1(outptr, w * sizeof(unsigned short), _p, vl); + vuint16m1_t _p = __riscv_vle16_v_u16m1(ptr, vl); + __riscv_vsse16_v_u16m1(outptr, w * sizeof(unsigned short), _p, vl); ptr += vl; outptr += 1; @@ -290,10 +296,10 @@ int Flatten_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size * elempack; while (n > 0) { - size_t vl = vsetvl_e16m1(n); + size_t vl = __riscv_vsetvl_e16m1(n); - vuint16m1_t _p = vle16_v_u16m1(ptr, vl); - vsse16_v_u16m1(outptr, size * sizeof(unsigned short), _p, vl); + vuint16m1_t _p = __riscv_vle16_v_u16m1(ptr, vl); + __riscv_vsse16_v_u16m1(outptr, size * sizeof(unsigned short), _p, vl); ptr += vl; outptr += 1; @@ -315,10 +321,10 @@ int Flatten_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size * elempack; while (n > 0) { - size_t vl = vsetvl_e16m8(n); + size_t vl = __riscv_vsetvl_e16m8(n); - vuint16m8_t _p = vle16_v_u16m8(ptr, vl); - vse16_v_u16m8(outptr, _p, vl); + vuint16m8_t _p = __riscv_vle16_v_u16m8(ptr, vl); + __riscv_vse16_v_u16m8(outptr, _p, vl); ptr += vl; outptr += vl; @@ -405,10 +411,10 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = w * elempack; while (n > 0) { - size_t vl = vsetvl_e8m1(n); + size_t vl = __riscv_vsetvl_e8m1(n); - vint8m1_t _p = vle8_v_i8m1(ptr, vl); - vsse8_v_i8m1(outptr, w * sizeof(unsigned char), _p, vl); + vint8m1_t _p = __riscv_vle8_v_i8m1(ptr, vl); + __riscv_vsse8_v_i8m1(outptr, w * sizeof(unsigned char), _p, vl); ptr += vl; outptr += 1; @@ -433,10 +439,10 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = size * elempack; while (n > 0) { - size_t vl = vsetvl_e8m1(n); + size_t vl = __riscv_vsetvl_e8m1(n); - vint8m1_t _p = vle8_v_i8m1(ptr, vl); - vsse8_v_i8m1(outptr, size * sizeof(signed char), _p, vl); + vint8m1_t _p = __riscv_vle8_v_i8m1(ptr, vl); + __riscv_vsse8_v_i8m1(outptr, size * sizeof(signed char), _p, vl); ptr += vl; outptr += 1; @@ -458,10 +464,10 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = size * elempack; while (n > 0) { - size_t vl = vsetvl_e8m8(n); + size_t vl = __riscv_vsetvl_e8m8(n); - vint8m8_t _p = vle8_v_i8m8(ptr, vl); - vse8_v_i8m8(outptr, _p, vl); + vint8m8_t _p = __riscv_vle8_v_i8m8(ptr, vl); + __riscv_vse8_v_i8m8(outptr, _p, vl); ptr += vl; outptr += vl; diff --git 
a/src/layer/riscv/gelu_riscv.cpp b/src/layer/riscv/gelu_riscv.cpp index 69b374998f3..34b29212669 100644 --- a/src/layer/riscv/gelu_riscv.cpp +++ b/src/layer/riscv/gelu_riscv.cpp @@ -48,20 +48,20 @@ int GELU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - size_t vl = vsetvl_e32m4(n); + size_t vl = __riscv_vsetvl_e32m4(n); - vfloat32m4_t _p = vle32_v_f32m4(ptr, vl); + vfloat32m4_t _p = __riscv_vle32_v_f32m4(ptr, vl); - vfloat32m4_t _arg = vfmul_vf_f32m4( - vfmul_vv_f32m4(vfmul_vv_f32m4(_p, _p, vl), _p, vl), 0.044715f, vl); + vfloat32m4_t _arg = __riscv_vfmul_vf_f32m4( + __riscv_vfmul_vv_f32m4(__riscv_vfmul_vv_f32m4(_p, _p, vl), _p, vl), 0.044715f, vl); - _arg = vfadd_vv_f32m4(_p, _arg, vl); - _arg = vfmul_vf_f32m4(_arg, 0.79788452f, vl); + _arg = __riscv_vfadd_vv_f32m4(_p, _arg, vl); + _arg = __riscv_vfmul_vf_f32m4(_arg, 0.79788452f, vl); vfloat32m4_t _tanharg = tanh_ps(_arg, vl); - _p = vfmul_vf_f32m4( - vfmul_vv_f32m4(_p, vfadd_vf_f32m4(_tanharg, 1.f, vl), vl), .5f, vl); + _p = __riscv_vfmul_vf_f32m4( + __riscv_vfmul_vv_f32m4(_p, __riscv_vfadd_vf_f32m4(_tanharg, 1.f, vl), vl), .5f, vl); - vse32_v_f32m4(ptr, _p, vl); + __riscv_vse32_v_f32m4(ptr, _p, vl); n -= vl; ptr += vl; } @@ -74,23 +74,32 @@ int GELU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { float* ptr = bottom_top_blob.channel(q); +#if C906 + // FIXME -O3 leads illegal instruction + for (int i = 0; i < size; i++) + { + // y = x * P(X <= x) where X ~ N(0, 1) + ptr[i] = 0.5f * ptr[i] * erfcf(-0.70710678f * ptr[i]); + } +#else int n = size; while (n > 0) { - size_t vl = vsetvl_e32m8(n); - auto _p = vle32_v_f32m8(ptr, vl); - auto _perfc = vfmul_vf_f32m8(_p, -.70710678f, vl); - _p = vfmul_vf_f32m8(_p, .5f, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + auto _p = __riscv_vle32_v_f32m8(ptr, vl); + auto _perfc = __riscv_vfmul_vf_f32m8(_p, -.70710678f, vl); + _p = __riscv_vfmul_vf_f32m8(_p, .5f, vl); // y = x * P(X <= x) where X ~ N(0, 1) _perfc = erfc_ps(_perfc, vl); - _p = vfmul_vv_f32m8(_p, _perfc, vl); - vse32_v_f32m8(ptr, _p, vl); + _p = __riscv_vfmul_vv_f32m8(_p, _perfc, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); n -= vl; ptr += vl; } +#endif } } diff --git a/src/layer/riscv/gemm_riscv.cpp b/src/layer/riscv/gemm_riscv.cpp index 8dee572548e..0c9862d55d0 100644 --- a/src/layer/riscv/gemm_riscv.cpp +++ b/src/layer/riscv/gemm_riscv.cpp @@ -33,153 +33,46 @@ Gemm_riscv::Gemm_riscv() support_inplace = false; nT = 0; -#if __riscv_vector - // When processing float data, - // even if the current hardware provides vector registers of more than 128 bits, - // vl=4 is still used, even though this will waste the width of the vector register. - vl = vsetvlmax_e32m1(); - vl = vl >= 4 ? 4 : vl; -#else - vl = 0; -#endif // __riscv_vector } -static void pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, size_t vl) +static void pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk) { +#if __riscv_vector + const int packn = csrr_vlenb() / 4; + const size_t vl = __riscv_vsetvl_e32m1(packn); +#endif + const int elempack = A.elempack; const int A_hstep = A.dims == 3 ? 
(int)A.cstep : A.w; + // NCNN_LOGE("pack_A_tile %d", elempack); + float* pp = AT; int ii = 0; #if __riscv_vector - for (; ii + 7 < max_ii; ii += 8) + for (; ii + (packn - 1) < max_ii; ii += packn) { - if (elempack == 4) + if (elempack == packn) { - const float* p0 = (const float*)A + (i + ii) * A_hstep + k * 4; - const float* p1 = (const float*)A + (i + ii + 4) * A_hstep + k * 4; + const float* p0 = (const float*)A + (i + ii) * A_hstep + k * packn; for (int kk = 0; kk < max_kk; kk++) { - vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); - vse32_v_f32m1(pp + 4, vle32_v_f32m1(p1, vl), vl); - pp += 8; - p0 += 4; - p1 += 4; + __riscv_vse32_v_f32m1(pp, __riscv_vle32_v_f32m1(p0, vl), vl); + pp += packn; + p0 += packn; } } if (elempack == 1) { const float* p0 = (const float*)A + (i + ii) * A_hstep + k; - const float* p1 = (const float*)A + (i + ii + 1) * A_hstep + k; - const float* p2 = (const float*)A + (i + ii + 2) * A_hstep + k; - const float* p3 = (const float*)A + (i + ii + 3) * A_hstep + k; - const float* p4 = (const float*)A + (i + ii + 4) * A_hstep + k; - const float* p5 = (const float*)A + (i + ii + 5) * A_hstep + k; - const float* p6 = (const float*)A + (i + ii + 6) * A_hstep + k; - const float* p7 = (const float*)A + (i + ii + 7) * A_hstep + k; - - int kk = 0; - for (; kk + 7 < max_kk; kk += 8) - { - vfloat32m1_t _r0l = vle32_v_f32m1(p0, vl); - vfloat32m1_t _r0h = vle32_v_f32m1(p0 + 4, vl); - vfloat32m1_t _r1l = vle32_v_f32m1(p1, vl); - vfloat32m1_t _r1h = vle32_v_f32m1(p1 + 4, vl); - vfloat32m1_t _r2l = vle32_v_f32m1(p2, vl); - vfloat32m1_t _r2h = vle32_v_f32m1(p2 + 4, vl); - vfloat32m1_t _r3l = vle32_v_f32m1(p3, vl); - vfloat32m1_t _r3h = vle32_v_f32m1(p3 + 4, vl); - vfloat32m1_t _r4l = vle32_v_f32m1(p4, vl); - vfloat32m1_t _r4h = vle32_v_f32m1(p4 + 4, vl); - vfloat32m1_t _r5l = vle32_v_f32m1(p5, vl); - vfloat32m1_t _r5h = vle32_v_f32m1(p5 + 4, vl); - vfloat32m1_t _r6l = vle32_v_f32m1(p6, vl); - vfloat32m1_t _r6h = vle32_v_f32m1(p6 + 4, vl); - vfloat32m1_t _r7l = vle32_v_f32m1(p7, vl); - vfloat32m1_t _r7h = vle32_v_f32m1(p7 + 4, vl); - - vsseg8e32_v_f32m1(pp, _r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l, vl); - vsseg8e32_v_f32m1(pp + 32, _r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h, vl); - - pp += 64; - p0 += 8; - p1 += 8; - p2 += 8; - p3 += 8; - p4 += 8; - p5 += 8; - p6 += 8; - p7 += 8; - } - for (; kk < max_kk; kk++) - { - pp[0] = p0[0]; - pp[1] = p1[0]; - pp[2] = p2[0]; - pp[3] = p3[0]; - pp[4] = p4[0]; - pp[5] = p5[0]; - pp[6] = p6[0]; - pp[7] = p7[0]; - pp += 8; - p0++; - p1++; - p2++; - p3++; - p4++; - p5++; - p6++; - p7++; - } - } - } - for (; ii + 3 < max_ii; ii += 4) - { - if (elempack == 4) - { - const float* p0 = (const float*)A + (i + ii) * A_hstep + k * 4; for (int kk = 0; kk < max_kk; kk++) { - vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); - pp += 4; - p0 += 4; - } - } - if (elempack == 1) - { - const float* p0 = (const float*)A + (i + ii) * A_hstep + k; - const float* p1 = (const float*)A + (i + ii + 1) * A_hstep + k; - const float* p2 = (const float*)A + (i + ii + 2) * A_hstep + k; - const float* p3 = (const float*)A + (i + ii + 3) * A_hstep + k; - - int kk = 0; - for (; kk + 3 < max_kk; kk += 4) - { - vfloat32m1_t v0 = vle32_v_f32m1(p0, vl); - vfloat32m1_t v1 = vle32_v_f32m1(p1, vl); - vfloat32m1_t v2 = vle32_v_f32m1(p2, vl); - vfloat32m1_t v3 = vle32_v_f32m1(p3, vl); - vsseg4e32_v_f32m1(pp, v0, v1, v2, v3, vl); - pp += 16; - p0 += 4; - p1 += 4; - p2 += 4; - p3 += 4; - } - for (; kk < max_kk; kk++) - { - pp[0] = p0[0]; - pp[1] = p1[0]; - pp[2] = p2[0]; - pp[3] = 
p3[0]; - pp += 4; + __riscv_vse32_v_f32m1(pp, __riscv_vlse32_v_f32m1(p0, A_hstep * sizeof(float), vl), vl); + pp += packn; p0++; - p1++; - p2++; - p3++; } } } @@ -193,14 +86,14 @@ static void pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max int kk = 0; #if __riscv_vector - for (; kk + 3 < max_kk; kk += 4) + for (; kk + (packn - 1) < max_kk; kk += packn) { - vfloat32m1_t v0 = vle32_v_f32m1(p0, vl); - vfloat32m1_t v1 = vle32_v_f32m1(p1, vl); - vsseg2e32_v_f32m1(pp, v0, v1, vl); - pp += 8; - p0 += 4; - p1 += 4; + vfloat32m1_t v0 = __riscv_vle32_v_f32m1(p0, vl); + vfloat32m1_t v1 = __riscv_vle32_v_f32m1(p1, vl); + __riscv_vsseg2e32_v_f32m1x2(pp, __riscv_vcreate_v_f32m1x2(v0, v1), vl); + pp += packn * 2; + p0 += packn; + p1 += packn; } #endif // __riscv_vector for (; kk < max_kk; kk++) @@ -221,11 +114,11 @@ static void pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max int kk = 0; #if __riscv_vector - for (; kk + 3 < max_kk; kk += 4) + for (; kk + (packn - 1) < max_kk; kk += packn) { - vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); - pp += 4; - p0 += 4; + __riscv_vse32_v_f32m1(pp, __riscv_vle32_v_f32m1(p0, vl), vl); + pp += packn; + p0 += packn; } #endif // __riscv_vector for (; kk < max_kk; kk++) @@ -238,80 +131,38 @@ static void pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max } } -static void transpose_pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, size_t vl) +static void transpose_pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk) { +#if __riscv_vector + const int packn = csrr_vlenb() / 4; + const size_t vl = __riscv_vsetvl_e32m1(packn); +#endif + const int elempack = A.elempack; const int A_hstep = A.dims == 3 ? (int)A.cstep : A.w; + // NCNN_LOGE("transpose_pack_A_tile %d", elempack); + float* pp = AT; int ii = 0; #if __riscv_vector - for (; ii + 7 < max_ii; ii += 8) - { - if (elempack == 4) - { - const float* p0 = (const float*)A + k * A_hstep + (i + ii) * 4; - - int kk = 0; - for (; kk + 3 < max_kk; kk += 4) - { - vfloat32m1_t _r0; - vfloat32m1_t _r1; - vfloat32m1_t _r2; - vfloat32m1_t _r3; - vfloat32m1_t _r4; - vfloat32m1_t _r5; - vfloat32m1_t _r6; - vfloat32m1_t _r7; - vlseg4e32_v_f32m1(&_r0, &_r1, &_r2, &_r3, p0, vl); - vlseg4e32_v_f32m1(&_r4, &_r5, &_r6, &_r7, p0 + 16, vl); - vse32_v_f32m1(pp, _r0, vl); - vse32_v_f32m1(pp + 4, _r4, vl); - vse32_v_f32m1(pp + 4 * 2, _r1, vl); - vse32_v_f32m1(pp + 4 * 3, _r5, vl); - vse32_v_f32m1(pp + 4 * 4, _r2, vl); - vse32_v_f32m1(pp + 4 * 5, _r6, vl); - vse32_v_f32m1(pp + 4 * 6, _r3, vl); - vse32_v_f32m1(pp + 4 * 7, _r7, vl); - pp += 32; - p0 += A_hstep * 4; - } - } - if (elempack == 1) - { - const float* p0 = (const float*)A + k * A_hstep + (i + ii); - - int kk = 0; - for (; kk < max_kk; kk++) - { - vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); - vse32_v_f32m1(pp + 4, vle32_v_f32m1(p0 + 4, vl), vl); - pp += 8; - p0 += A_hstep; - } - } - } - for (; ii + 3 < max_ii; ii += 4) + for (; ii + (packn - 1) < max_ii; ii += packn) { - if (elempack == 4) + if (elempack == packn) { - const float* p0 = (const float*)A + k * A_hstep + (i + ii) * 4; + const float* p0 = (const float*)A + k * A_hstep + (i + ii) * packn; int kk = 0; - for (; kk + 3 < max_kk; kk += 4) + for (; kk + (packn - 1) < max_kk; kk += packn) { - vfloat32m1_t _r0; - vfloat32m1_t _r1; - vfloat32m1_t _r2; - vfloat32m1_t _r3; - vlseg4e32_v_f32m1(&_r0, &_r1, &_r2, &_r3, p0, vl); - vse32_v_f32m1(pp, _r0, vl); - vse32_v_f32m1(pp + 4, _r1, vl); - vse32_v_f32m1(pp + 4 * 2, _r2, vl); - 
vse32_v_f32m1(pp + 4 * 3, _r3, vl); - pp += 16; - p0 += A_hstep * 4; + // transposeNxN + for (int l = 0; l < packn; l++) + { + __riscv_vse32_v_f32m1(pp, __riscv_vlse32_v_f32m1(p0 + l, packn * sizeof(float), vl), vl); + pp += packn; + } + p0 += A_hstep * packn; } } if (elempack == 1) @@ -321,8 +172,8 @@ static void transpose_pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int int kk = 0; for (; kk < max_kk; kk++) { - vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); - pp += 4; + __riscv_vse32_v_f32m1(pp, __riscv_vle32_v_f32m1(p0, vl), vl); + pp += packn; p0 += A_hstep; } } @@ -331,18 +182,18 @@ static void transpose_pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int for (; ii + 1 < max_ii; ii += 2) { #if __riscv_vector - if (elempack == 4) + if (elempack == packn) { - const float* p0 = (const float*)A + k * A_hstep + (i + ii) * 4; + const float* p0 = (const float*)A + k * A_hstep + (i + ii) * packn; int kk = 0; - for (; kk + 3 < max_kk; kk += 4) + for (; kk + (packn - 1) < max_kk; kk += packn) { - vfloat32m1_t v0 = vle32_v_f32m1(p0, vl); - vfloat32m1_t v1 = vle32_v_f32m1(p0 + 4, vl); - vsseg2e32_v_f32m1(pp, v0, v1, vl); - pp += 8; - p0 += A_hstep * 4; + vfloat32m1_t v0 = __riscv_vle32_v_f32m1(p0, vl); + vfloat32m1_t v1 = __riscv_vle32_v_f32m1(p0 + packn, vl); + __riscv_vsseg2e32_v_f32m1x2(pp, __riscv_vcreate_v_f32m1x2(v0, v1), vl); + pp += packn * 2; + p0 += A_hstep * packn; } } #endif // __riscv_vector @@ -363,16 +214,16 @@ static void transpose_pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int for (; ii < max_ii; ii += 1) { #if __riscv_vector - if (elempack == 4) + if (elempack == packn) { - const float* p0 = (const float*)A + k * A_hstep + (i + ii) * 4; + const float* p0 = (const float*)A + k * A_hstep + (i + ii) * packn; int kk = 0; - for (; kk + 3 < max_kk; kk += 4) + for (; kk + (packn - 1) < max_kk; kk += packn) { - vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); - pp += 4; - p0 += A_hstep * 4; + __riscv_vse32_v_f32m1(pp, __riscv_vle32_v_f32m1(p0, vl), vl); + pp += packn; + p0 += A_hstep * packn; } } #endif // __riscv_vector @@ -391,243 +242,44 @@ static void transpose_pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int } } -static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, size_t vl) +static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk) { +#if __riscv_vector + const int packn = csrr_vlenb() / 4; + const size_t vl = __riscv_vsetvl_e32m1(packn); +#endif + const int elempack = B.elempack; const int B_hstep = B.dims == 3 ? 
(int)B.cstep : B.w; + // NCNN_LOGE("pack_B_tile %d", elempack); + float* pp = BT; int jj = 0; #if __riscv_vector - for (; jj + 11 < max_jj; jj += 12) - { - if (elempack == 4) - { - const float* p0 = (const float*)B + (j + jj) * B_hstep + k * 4; - const float* p1 = (const float*)B + (j + jj + 4) * B_hstep + k * 4; - const float* p2 = (const float*)B + (j + jj + 8) * B_hstep + k * 4; - - for (int kk = 0; kk < max_kk; kk++) - { - vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); - vse32_v_f32m1(pp + 4, vle32_v_f32m1(p1, vl), vl); - vse32_v_f32m1(pp + 8, vle32_v_f32m1(p2, vl), vl); - pp += 12; - p0 += 4; - p1 += 4; - p2 += 4; - } - } - if (elempack == 1) - { - const float* p0 = (const float*)B + (j + jj) * B_hstep + k; - const float* p1 = (const float*)B + (j + jj + 1) * B_hstep + k; - const float* p2 = (const float*)B + (j + jj + 2) * B_hstep + k; - const float* p3 = (const float*)B + (j + jj + 3) * B_hstep + k; - const float* p4 = (const float*)B + (j + jj + 4) * B_hstep + k; - const float* p5 = (const float*)B + (j + jj + 5) * B_hstep + k; - const float* p6 = (const float*)B + (j + jj + 6) * B_hstep + k; - const float* p7 = (const float*)B + (j + jj + 7) * B_hstep + k; - const float* p8 = (const float*)B + (j + jj + 8) * B_hstep + k; - const float* p9 = (const float*)B + (j + jj + 9) * B_hstep + k; - const float* pa = (const float*)B + (j + jj + 10) * B_hstep + k; - const float* pb = (const float*)B + (j + jj + 11) * B_hstep + k; - - int kk = 0; - for (; kk + 3 < max_kk; kk += 4) - { - vfloat32m1_t _r0 = vle32_v_f32m1(p0, vl); - vfloat32m1_t _r1 = vle32_v_f32m1(p1, vl); - vfloat32m1_t _r2 = vle32_v_f32m1(p2, vl); - vfloat32m1_t _r3 = vle32_v_f32m1(p3, vl); - vfloat32m1_t _r4 = vle32_v_f32m1(p4, vl); - vfloat32m1_t _r5 = vle32_v_f32m1(p5, vl); - vfloat32m1_t _r6 = vle32_v_f32m1(p6, vl); - vfloat32m1_t _r7 = vle32_v_f32m1(p7, vl); - vfloat32m1_t _r8 = vle32_v_f32m1(p8, vl); - vfloat32m1_t _r9 = vle32_v_f32m1(p9, vl); - vfloat32m1_t _ra = vle32_v_f32m1(pa, vl); - vfloat32m1_t _rb = vle32_v_f32m1(pb, vl); - - transpose4x4_ps(_r0, _r1, _r2, _r3, vl); - transpose4x4_ps(_r4, _r5, _r6, _r7, vl); - transpose4x4_ps(_r8, _r9, _ra, _rb, vl); - - vse32_v_f32m1(pp, _r0, vl); - vse32_v_f32m1(pp + 4, _r4, vl); - vse32_v_f32m1(pp + 4 * 2, _r8, vl); - vse32_v_f32m1(pp + 4 * 3, _r1, vl); - vse32_v_f32m1(pp + 4 * 4, _r5, vl); - vse32_v_f32m1(pp + 4 * 5, _r9, vl); - vse32_v_f32m1(pp + 4 * 6, _r2, vl); - vse32_v_f32m1(pp + 4 * 7, _r6, vl); - vse32_v_f32m1(pp + 4 * 8, _ra, vl); - vse32_v_f32m1(pp + 4 * 9, _r3, vl); - vse32_v_f32m1(pp + 4 * 10, _r7, vl); - vse32_v_f32m1(pp + 4 * 11, _rb, vl); - pp += 48; - p0 += 4; - p1 += 4; - p2 += 4; - p3 += 4; - p4 += 4; - p5 += 4; - p6 += 4; - p7 += 4; - p8 += 4; - p9 += 4; - pa += 4; - pb += 4; - } - for (; kk < max_kk; kk++) - { - pp[0] = p0[0]; - pp[1] = p1[0]; - pp[2] = p2[0]; - pp[3] = p3[0]; - pp[4] = p4[0]; - pp[5] = p5[0]; - pp[6] = p6[0]; - pp[7] = p7[0]; - pp[8] = p8[0]; - pp[9] = p9[0]; - pp[10] = pa[0]; - pp[11] = pb[0]; - pp += 12; - p0++; - p1++; - p2++; - p3++; - p4++; - p5++; - p6++; - p7++; - p8++; - p9++; - pa++; - pb++; - } - } - } - for (; jj + 7 < max_jj; jj += 8) + for (; jj + (packn - 1) < max_jj; jj += packn) { - if (elempack == 4) + if (elempack == packn) { - const float* p0 = (const float*)B + (j + jj) * B_hstep + k * 4; - const float* p1 = (const float*)B + (j + jj + 4) * B_hstep + k * 4; + const float* p0 = (const float*)B + (j + jj) * B_hstep + k * packn; for (int kk = 0; kk < max_kk; kk++) { - vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); - 
vse32_v_f32m1(pp + 4, vle32_v_f32m1(p1, vl), vl); - pp += 8; - p0 += 4; - p1 += 4; + __riscv_vse32_v_f32m1(pp, __riscv_vle32_v_f32m1(p0, vl), vl); + pp += packn; + p0 += packn; } } if (elempack == 1) { const float* p0 = (const float*)B + (j + jj) * B_hstep + k; - const float* p1 = (const float*)B + (j + jj + 1) * B_hstep + k; - const float* p2 = (const float*)B + (j + jj + 2) * B_hstep + k; - const float* p3 = (const float*)B + (j + jj + 3) * B_hstep + k; - const float* p4 = (const float*)B + (j + jj + 4) * B_hstep + k; - const float* p5 = (const float*)B + (j + jj + 5) * B_hstep + k; - const float* p6 = (const float*)B + (j + jj + 6) * B_hstep + k; - const float* p7 = (const float*)B + (j + jj + 7) * B_hstep + k; - - int kk = 0; - for (; kk + 3 < max_kk; kk += 4) - { - vfloat32m1_t _r0 = vle32_v_f32m1(p0, vl); - vfloat32m1_t _r1 = vle32_v_f32m1(p1, vl); - vfloat32m1_t _r2 = vle32_v_f32m1(p2, vl); - vfloat32m1_t _r3 = vle32_v_f32m1(p3, vl); - vfloat32m1_t _r4 = vle32_v_f32m1(p4, vl); - vfloat32m1_t _r5 = vle32_v_f32m1(p5, vl); - vfloat32m1_t _r6 = vle32_v_f32m1(p6, vl); - vfloat32m1_t _r7 = vle32_v_f32m1(p7, vl); - - vsseg8e32_v_f32m1(pp, _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, vl); - - pp += 32; - p0 += 4; - p1 += 4; - p2 += 4; - p3 += 4; - p4 += 4; - p5 += 4; - p6 += 4; - p7 += 4; - } - for (; kk < max_kk; kk++) - { - pp[0] = p0[0]; - pp[1] = p1[0]; - pp[2] = p2[0]; - pp[3] = p3[0]; - pp[4] = p4[0]; - pp[5] = p5[0]; - pp[6] = p6[0]; - pp[7] = p7[0]; - pp += 8; - p0++; - p1++; - p2++; - p3++; - p4++; - p5++; - p6++; - p7++; - } - } - } - for (; jj + 3 < max_jj; jj += 4) - { - if (elempack == 4) - { - const float* p0 = (const float*)B + (j + jj) * B_hstep + k * 4; for (int kk = 0; kk < max_kk; kk++) { - vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); - pp += 4; - p0 += 4; - } - } - if (elempack == 1) - { - const float* p0 = (const float*)B + (j + jj) * B_hstep + k; - const float* p1 = (const float*)B + (j + jj + 1) * B_hstep + k; - const float* p2 = (const float*)B + (j + jj + 2) * B_hstep + k; - const float* p3 = (const float*)B + (j + jj + 3) * B_hstep + k; - - int kk = 0; - for (; kk + 3 < max_kk; kk += 4) - { - vfloat32m1_t v0 = vle32_v_f32m1(p0, vl); - vfloat32m1_t v1 = vle32_v_f32m1(p1, vl); - vfloat32m1_t v2 = vle32_v_f32m1(p2, vl); - vfloat32m1_t v3 = vle32_v_f32m1(p3, vl); - vsseg4e32_v_f32m1(pp, v0, v1, v2, v3, vl); - pp += 16; - p0 += 4; - p1 += 4; - p2 += 4; - p3 += 4; - } - for (; kk < max_kk; kk++) - { - pp[0] = p0[0]; - pp[1] = p1[0]; - pp[2] = p2[0]; - pp[3] = p3[0]; - pp += 4; + __riscv_vse32_v_f32m1(pp, __riscv_vlse32_v_f32m1(p0, B_hstep * sizeof(float), vl), vl); + pp += packn; p0++; - p1++; - p2++; - p3++; } } } @@ -641,14 +293,14 @@ static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max int kk = 0; #if __riscv_vector - for (; kk + 3 < max_kk; kk += 4) + for (; kk + (packn - 1) < max_kk; kk += packn) { - vfloat32m1_t v0 = vle32_v_f32m1(p0, vl); - vfloat32m1_t v1 = vle32_v_f32m1(p1, vl); - vsseg2e32_v_f32m1(pp, v0, v1, vl); - pp += 8; - p0 += 4; - p1 += 4; + vfloat32m1_t v0 = __riscv_vle32_v_f32m1(p0, vl); + vfloat32m1_t v1 = __riscv_vle32_v_f32m1(p1, vl); + __riscv_vsseg2e32_v_f32m1x2(pp, __riscv_vcreate_v_f32m1x2(v0, v1), vl); + pp += packn * 2; + p0 += packn; + p1 += packn; } #endif // __riscv_vector for (; kk < max_kk; kk++) @@ -669,11 +321,11 @@ static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max int kk = 0; #if __riscv_vector - for (; kk + 3 < max_kk; kk += 4) + for (; kk + (packn - 1) < max_kk; kk += packn) { - 
vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); - pp += 4; - p0 += 4; + __riscv_vse32_v_f32m1(pp, __riscv_vle32_v_f32m1(p0, vl), vl); + pp += packn; + p0 += packn; } #endif // __riscv_vector for (; kk < max_kk; kk++) @@ -686,135 +338,38 @@ static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max } } -static void transpose_pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, size_t vl) +static void transpose_pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk) { +#if __riscv_vector + const int packn = csrr_vlenb() / 4; + const size_t vl = __riscv_vsetvl_e32m1(packn); +#endif + const int elempack = B.elempack; const int B_hstep = B.dims == 3 ? (int)B.cstep : B.w; + // NCNN_LOGE("transpose_pack_B_tile %d", elempack); + float* pp = BT; int jj = 0; #if __riscv_vector - for (; jj + 11 < max_jj; jj += 12) - { - if (elempack == 4) - { - const float* p0 = (const float*)B + k * B_hstep + (j + jj) * 4; - - int kk = 0; - for (; kk + 3 < max_kk; kk += 4) - { - vfloat32m1_t _r0; - vfloat32m1_t _r1; - vfloat32m1_t _r2; - vfloat32m1_t _r3; - vfloat32m1_t _r4; - vfloat32m1_t _r5; - vfloat32m1_t _r6; - vfloat32m1_t _r7; - vfloat32m1_t _r8; - vfloat32m1_t _r9; - vfloat32m1_t _ra; - vfloat32m1_t _rb; - vlseg4e32_v_f32m1(&_r0, &_r1, &_r2, &_r3, p0, vl); - vlseg4e32_v_f32m1(&_r4, &_r5, &_r6, &_r7, p0 + 16, vl); - vlseg4e32_v_f32m1(&_r8, &_r9, &_ra, &_rb, p0 + 32, vl); - vse32_v_f32m1(pp, _r0, vl); - vse32_v_f32m1(pp + 4, _r4, vl); - vse32_v_f32m1(pp + 4 * 2, _r8, vl); - vse32_v_f32m1(pp + 4 * 3, _r1, vl); - vse32_v_f32m1(pp + 4 * 4, _r5, vl); - vse32_v_f32m1(pp + 4 * 5, _r9, vl); - vse32_v_f32m1(pp + 4 * 6, _r2, vl); - vse32_v_f32m1(pp + 4 * 7, _r6, vl); - vse32_v_f32m1(pp + 4 * 8, _ra, vl); - vse32_v_f32m1(pp + 4 * 9, _r3, vl); - vse32_v_f32m1(pp + 4 * 10, _r7, vl); - vse32_v_f32m1(pp + 4 * 11, _rb, vl); - pp += 48; - p0 += B_hstep * 4; - } - } - if (elempack == 1) - { - const float* p0 = (const float*)B + k * B_hstep + (j + jj); - - int kk = 0; - for (; kk < max_kk; kk++) - { - vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); - vse32_v_f32m1(pp + 4, vle32_v_f32m1(p0 + 4, vl), vl); - vse32_v_f32m1(pp + 8, vle32_v_f32m1(p0 + 8, vl), vl); - pp += 12; - p0 += B_hstep; - } - } - } - for (; jj + 7 < max_jj; jj += 8) - { - if (elempack == 4) - { - const float* p0 = (const float*)B + k * B_hstep + (j + jj) * 4; - - int kk = 0; - for (; kk + 3 < max_kk; kk += 4) - { - vfloat32m1_t _r0; - vfloat32m1_t _r1; - vfloat32m1_t _r2; - vfloat32m1_t _r3; - vfloat32m1_t _r4; - vfloat32m1_t _r5; - vfloat32m1_t _r6; - vfloat32m1_t _r7; - vlseg4e32_v_f32m1(&_r0, &_r1, &_r2, &_r3, p0, vl); - vlseg4e32_v_f32m1(&_r4, &_r5, &_r6, &_r7, p0 + 16, vl); - vse32_v_f32m1(pp, _r0, vl); - vse32_v_f32m1(pp + 4, _r4, vl); - vse32_v_f32m1(pp + 4 * 2, _r1, vl); - vse32_v_f32m1(pp + 4 * 3, _r5, vl); - vse32_v_f32m1(pp + 4 * 4, _r2, vl); - vse32_v_f32m1(pp + 4 * 5, _r6, vl); - vse32_v_f32m1(pp + 4 * 6, _r3, vl); - vse32_v_f32m1(pp + 4 * 7, _r7, vl); - pp += 32; - p0 += B_hstep * 4; - } - } - if (elempack == 1) - { - const float* p0 = (const float*)B + k * B_hstep + (j + jj); - - int kk = 0; - for (; kk < max_kk; kk++) - { - vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); - vse32_v_f32m1(pp + 4, vle32_v_f32m1(p0 + 4, vl), vl); - pp += 8; - p0 += B_hstep; - } - } - } - for (; jj + 3 < max_jj; jj += 4) + for (; jj + (packn - 1) < max_jj; jj += packn) { - if (elempack == 4) + if (elempack == packn) { - const float* p0 = (const float*)B + k * B_hstep + (j + jj) * 4; + const float* p0 = (const 
float*)B + k * B_hstep + (j + jj) * packn; int kk = 0; - for (; kk + 3 < max_kk; kk += 4) + for (; kk + (packn - 1) < max_kk; kk += packn) { - vfloat32m1_t _r0; - vfloat32m1_t _r1; - vfloat32m1_t _r2; - vfloat32m1_t _r3; - vlseg4e32_v_f32m1(&_r0, &_r1, &_r2, &_r3, p0, vl); - vse32_v_f32m1(pp, _r0, vl); - vse32_v_f32m1(pp + 4, _r1, vl); - vse32_v_f32m1(pp + 4 * 2, _r2, vl); - vse32_v_f32m1(pp + 4 * 3, _r3, vl); - pp += 16; - p0 += B_hstep * 4; + // transposeNxN + for (int l = 0; l < packn; l++) + { + __riscv_vse32_v_f32m1(pp, __riscv_vlse32_v_f32m1(p0 + l, packn * sizeof(float), vl), vl); + pp += packn; + } + p0 += B_hstep * packn; } } if (elempack == 1) @@ -824,8 +379,8 @@ static void transpose_pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int int kk = 0; for (; kk < max_kk; kk++) { - vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); - pp += 4; + __riscv_vse32_v_f32m1(pp, __riscv_vle32_v_f32m1(p0, vl), vl); + pp += packn; p0 += B_hstep; } } @@ -834,18 +389,18 @@ static void transpose_pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int for (; jj + 1 < max_jj; jj += 2) { #if __riscv_vector - if (elempack == 4) + if (elempack == packn) { - const float* p0 = (const float*)B + k * B_hstep + (j + jj) * 4; + const float* p0 = (const float*)B + k * B_hstep + (j + jj) * packn; int kk = 0; - for (; kk + 3 < max_kk; kk += 4) + for (; kk + (packn - 1) < max_kk; kk += packn) { - vfloat32m1_t v0 = vle32_v_f32m1(p0, vl); - vfloat32m1_t v1 = vle32_v_f32m1(p0 + 4, vl); - vsseg2e32_v_f32m1(pp, v0, v1, vl); - pp += 8; - p0 += B_hstep * 4; + vfloat32m1_t v0 = __riscv_vle32_v_f32m1(p0, vl); + vfloat32m1_t v1 = __riscv_vle32_v_f32m1(p0 + packn, vl); + __riscv_vsseg2e32_v_f32m1x2(pp, __riscv_vcreate_v_f32m1x2(v0, v1), vl); + pp += packn * 2; + p0 += B_hstep * packn; } } #endif // __riscv_vector @@ -866,16 +421,16 @@ static void transpose_pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int for (; jj < max_jj; jj += 1) { #if __riscv_vector - if (elempack == 4) + if (elempack == packn) { - const float* p0 = (const float*)B + k * B_hstep + (j + jj) * 4; + const float* p0 = (const float*)B + k * B_hstep + (j + jj) * packn; int kk = 0; - for (; kk + 3 < max_kk; kk += 4) + for (; kk + (packn - 1) < max_kk; kk += packn) { - vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); - pp += 4; - p0 += B_hstep * 4; + __riscv_vse32_v_f32m1(pp, __riscv_vle32_v_f32m1(p0, vl), vl); + pp += packn; + p0 += B_hstep * packn; } } #endif // __riscv_vector @@ -894,67 +449,37 @@ static void transpose_pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int } } -static void transpose_unpack_output_tile(const Mat& topT, Mat& top_blob, int i, int max_ii, int j, int max_jj, size_t vl) +static void transpose_unpack_output_tile(const Mat& topT, Mat& top_blob, int i, int max_ii, int j, int max_jj) { +#if __riscv_vector + const int packn = csrr_vlenb() / 4; + const size_t vl = __riscv_vsetvl_e32m1(packn); +#endif + const int out_elempack = top_blob.elempack; const int out_hstep = top_blob.dims == 3 ? 
(int)top_blob.cstep : top_blob.w; + // NCNN_LOGE("transpose_unpack_output_tile %d", out_elempack); + const float* pp = topT; int ii = 0; #if __riscv_vector - for (; ii + 7 < max_ii; ii += 8) - { - if (out_elempack == 4) - { - float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * 4; - - for (int jj = 0; jj + 3 < max_jj; jj += 4) - { - vfloat32m1_t v0 = vle32_v_f32m1(pp, vl); - vfloat32m1_t v1 = vle32_v_f32m1(pp + 8, vl); - vfloat32m1_t v2 = vle32_v_f32m1(pp + 16, vl); - vfloat32m1_t v3 = vle32_v_f32m1(pp + 24, vl); - vsseg4e32_v_f32m1(p0, v0, v1, v2, v3, vl); - v0 = vle32_v_f32m1(pp + 4, vl); - v1 = vle32_v_f32m1(pp + 12, vl); - v2 = vle32_v_f32m1(pp + 20, vl); - v3 = vle32_v_f32m1(pp + 28, vl); - vsseg4e32_v_f32m1(p0 + 16, v0, v1, v2, v3, vl); - pp += 32; - p0 += out_hstep * 4; - } - } - if (out_elempack == 1) - { - float* p0 = (float*)top_blob + j * out_hstep + (i + ii); - - for (int jj = 0; jj < max_jj; jj += 1) - { - vfloat32m1_t _r0 = vle32_v_f32m1(pp, vl); - vfloat32m1_t _r1 = vle32_v_f32m1(pp + 4, vl); - vse32_v_f32m1(p0, _r0, vl); - vse32_v_f32m1(p0 + 4, _r1, vl); - pp += 8; - p0 += out_hstep; - } - } - } - for (; ii + 3 < max_ii; ii += 4) + for (; ii + (packn - 1) < max_ii; ii += packn) { - if (out_elempack == 4) + if (out_elempack == packn) { - float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * 4; + float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * packn; - for (int jj = 0; jj + 3 < max_jj; jj += 4) + for (int jj = 0; jj + (packn - 1) < max_jj; jj += packn) { - vfloat32m1_t v0 = vle32_v_f32m1(pp, vl); - vfloat32m1_t v1 = vle32_v_f32m1(pp + 4, vl); - vfloat32m1_t v2 = vle32_v_f32m1(pp + 8, vl); - vfloat32m1_t v3 = vle32_v_f32m1(pp + 12, vl); - vsseg4e32_v_f32m1(p0, v0, v1, v2, v3, vl); - pp += 16; - p0 += out_hstep * 4; + // transposeNxN + for (int l = 0; l < packn; l++) + { + __riscv_vsse32_v_f32m1(p0 + l, packn * sizeof(float), __riscv_vle32_v_f32m1(pp, vl), vl); + pp += packn; + } + p0 += out_hstep * packn; } } if (out_elempack == 1) @@ -963,9 +488,9 @@ static void transpose_unpack_output_tile(const Mat& topT, Mat& top_blob, int i, for (int jj = 0; jj < max_jj; jj += 1) { - vfloat32m1_t _r0 = vle32_v_f32m1(pp, vl); - vse32_v_f32m1(p0, _r0, vl); - pp += 4; + vfloat32m1_t _r0 = __riscv_vle32_v_f32m1(pp, vl); + __riscv_vse32_v_f32m1(p0, _r0, vl); + pp += packn; p0 += out_hstep; } } @@ -974,22 +499,17 @@ static void transpose_unpack_output_tile(const Mat& topT, Mat& top_blob, int i, for (; ii + 1 < max_ii; ii += 2) { #if __riscv_vector - if (out_elempack == 4) + if (out_elempack == packn) { - float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * 4; + float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * packn; - for (int jj = 0; jj + 3 < max_jj; jj += 4) + for (int jj = 0; jj + (packn - 1) < max_jj; jj += packn) { - p0[0] = pp[0]; - p0[1] = pp[2]; - p0[2] = pp[4]; - p0[3] = pp[6]; - p0[4] = pp[1]; - p0[5] = pp[3]; - p0[6] = pp[5]; - p0[7] = pp[7]; - pp += 8; - p0 += out_hstep * 4; + vfloat32m1x2_t _s0 = __riscv_vlseg2e32_v_f32m1x2(pp, vl); + __riscv_vse32_v_f32m1(p0, __riscv_vget_v_f32m1x2_f32m1(_s0, 0), vl); + __riscv_vse32_v_f32m1(p0 + packn, __riscv_vget_v_f32m1x2_f32m1(_s0, 1), vl); + pp += packn * 2; + p0 += out_hstep * packn; } } #endif // __riscv_vector @@ -1009,16 +529,16 @@ static void transpose_unpack_output_tile(const Mat& topT, Mat& top_blob, int i, for (; ii < max_ii; ii += 1) { #if __riscv_vector - if (out_elempack == 4) + if (out_elempack == packn) { - float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * 4; + float* p0 = 
(float*)top_blob + j * out_hstep + (i + ii) * packn; - for (int jj = 0; jj + 3 < max_jj; jj += 4) + for (int jj = 0; jj + (packn - 1) < max_jj; jj += packn) { - vfloat32m1_t _r0 = vle32_v_f32m1(pp, vl); - vse32_v_f32m1(p0, _r0, vl); - pp += 4; - p0 += out_hstep * 4; + vfloat32m1_t _r0 = __riscv_vle32_v_f32m1(pp, vl); + __riscv_vse32_v_f32m1(p0, _r0, vl); + pp += packn; + p0 += out_hstep * packn; } } #endif // __riscv_vector @@ -1036,8 +556,13 @@ static void transpose_unpack_output_tile(const Mat& topT, Mat& top_blob, int i, } } -static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, const Mat& CT_tile, Mat& topT_tile, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, int k, int max_kk, bool k_end, size_t vl) +static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, const Mat& CT_tile, Mat& topT_tile, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, int k, int max_kk, bool k_end) { +#if __riscv_vector + const int packn = csrr_vlenb() / 4; + const size_t vl = __riscv_vsetvl_e32m1(packn); +#endif + const int out_elempack = top_blob.elempack; const int out_hstep = top_blob.dims == 3 ? (int)top_blob.cstep : top_blob.w; @@ -1049,7 +574,7 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons int ii = 0; #if __riscv_vector - for (; ii + 7 < max_ii; ii += 8) + for (; ii + (packn - 1) < max_ii; ii += packn) { float* outptr0 = (float*)top_blob + (i + ii) * out_hstep + j * out_elempack; @@ -1059,1526 +584,253 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons { if (broadcast_type_C == 1 || broadcast_type_C == 2) { - pC = (const float*)CT_tile + i + ii; - } - if (broadcast_type_C == 4) - { - pC = (const float*)CT_tile + j; - } - } - - int jj = 0; - for (; jj + 11 < max_jj; jj += 12) - { - vfloat32m1_t _sum00; - vfloat32m1_t _sum01; - vfloat32m1_t _sum10; - vfloat32m1_t _sum11; - vfloat32m1_t _sum20; - vfloat32m1_t _sum21; - vfloat32m1_t _sum30; - vfloat32m1_t _sum31; - vfloat32m1_t _sum40; - vfloat32m1_t _sum41; - vfloat32m1_t _sum50; - vfloat32m1_t _sum51; - vfloat32m1_t _sum60; - vfloat32m1_t _sum61; - vfloat32m1_t _sum70; - vfloat32m1_t _sum71; - vfloat32m1_t _sum80; - vfloat32m1_t _sum81; - vfloat32m1_t _sum90; - vfloat32m1_t _sum91; - vfloat32m1_t _suma0; - vfloat32m1_t _suma1; - vfloat32m1_t _sumb0; - vfloat32m1_t _sumb1; - - if (k == 0) - { - _sum00 = vfmv_v_f_f32m1(0.f, vl); - _sum01 = vfmv_v_f_f32m1(0.f, vl); - _sum10 = vfmv_v_f_f32m1(0.f, vl); - _sum11 = vfmv_v_f_f32m1(0.f, vl); - _sum20 = vfmv_v_f_f32m1(0.f, vl); - _sum21 = vfmv_v_f_f32m1(0.f, vl); - _sum30 = vfmv_v_f_f32m1(0.f, vl); - _sum31 = vfmv_v_f_f32m1(0.f, vl); - _sum40 = vfmv_v_f_f32m1(0.f, vl); - _sum41 = vfmv_v_f_f32m1(0.f, vl); - _sum50 = vfmv_v_f_f32m1(0.f, vl); - _sum51 = vfmv_v_f_f32m1(0.f, vl); - _sum60 = vfmv_v_f_f32m1(0.f, vl); - _sum61 = vfmv_v_f_f32m1(0.f, vl); - _sum70 = vfmv_v_f_f32m1(0.f, vl); - _sum71 = vfmv_v_f_f32m1(0.f, vl); - _sum80 = vfmv_v_f_f32m1(0.f, vl); - _sum81 = vfmv_v_f_f32m1(0.f, vl); - _sum90 = vfmv_v_f_f32m1(0.f, vl); - _sum91 = vfmv_v_f_f32m1(0.f, vl); - _suma0 = vfmv_v_f_f32m1(0.f, vl); - _suma1 = vfmv_v_f_f32m1(0.f, vl); - _sumb0 = vfmv_v_f_f32m1(0.f, vl); - _sumb1 = vfmv_v_f_f32m1(0.f, vl); - - if (pC) - { - if (broadcast_type_C == 0) - { - _sum00 = vfmv_v_f_f32m1(pC[0], vl); - _sum01 = vfmv_v_f_f32m1(pC[0], vl); - _sum10 = vfmv_v_f_f32m1(pC[0], vl); - _sum11 = vfmv_v_f_f32m1(pC[0], vl); - _sum20 = vfmv_v_f_f32m1(pC[0], vl); - _sum21 = 
vfmv_v_f_f32m1(pC[0], vl); - _sum30 = vfmv_v_f_f32m1(pC[0], vl); - _sum31 = vfmv_v_f_f32m1(pC[0], vl); - _sum40 = vfmv_v_f_f32m1(pC[0], vl); - _sum41 = vfmv_v_f_f32m1(pC[0], vl); - _sum50 = vfmv_v_f_f32m1(pC[0], vl); - _sum51 = vfmv_v_f_f32m1(pC[0], vl); - _sum60 = vfmv_v_f_f32m1(pC[0], vl); - _sum61 = vfmv_v_f_f32m1(pC[0], vl); - _sum70 = vfmv_v_f_f32m1(pC[0], vl); - _sum71 = vfmv_v_f_f32m1(pC[0], vl); - _sum80 = vfmv_v_f_f32m1(pC[0], vl); - _sum81 = vfmv_v_f_f32m1(pC[0], vl); - _sum90 = vfmv_v_f_f32m1(pC[0], vl); - _sum91 = vfmv_v_f_f32m1(pC[0], vl); - _suma0 = vfmv_v_f_f32m1(pC[0], vl); - _suma1 = vfmv_v_f_f32m1(pC[0], vl); - _sumb0 = vfmv_v_f_f32m1(pC[0], vl); - _sumb1 = vfmv_v_f_f32m1(pC[0], vl); - } - if (broadcast_type_C == 1 || broadcast_type_C == 2) - { - _sum00 = vle32_v_f32m1(pC, vl); - _sum01 = vle32_v_f32m1(pC + 4, vl); - _sum10 = _sum00; - _sum11 = _sum01; - _sum20 = _sum00; - _sum21 = _sum01; - _sum30 = _sum00; - _sum31 = _sum01; - _sum40 = _sum00; - _sum41 = _sum01; - _sum50 = _sum00; - _sum51 = _sum01; - _sum60 = _sum00; - _sum61 = _sum01; - _sum70 = _sum00; - _sum71 = _sum01; - _sum80 = _sum00; - _sum81 = _sum01; - _sum90 = _sum00; - _sum91 = _sum01; - _suma0 = _sum00; - _suma1 = _sum01; - _sumb0 = _sum00; - _sumb1 = _sum01; - } - if (broadcast_type_C == 3) - { - _sum00 = vle32_v_f32m1(pC, vl); - _sum01 = vle32_v_f32m1(pC + 4 * 1, vl); - _sum10 = vle32_v_f32m1(pC + 4 * 2, vl); - _sum11 = vle32_v_f32m1(pC + 4 * 3, vl); - _sum20 = vle32_v_f32m1(pC + 4 * 4, vl); - _sum21 = vle32_v_f32m1(pC + 4 * 5, vl); - _sum30 = vle32_v_f32m1(pC + 4 * 6, vl); - _sum31 = vle32_v_f32m1(pC + 4 * 7, vl); - _sum40 = vle32_v_f32m1(pC + 4 * 8, vl); - _sum41 = vle32_v_f32m1(pC + 4 * 9, vl); - _sum50 = vle32_v_f32m1(pC + 4 * 10, vl); - _sum51 = vle32_v_f32m1(pC + 4 * 11, vl); - _sum60 = vle32_v_f32m1(pC + 4 * 12, vl); - _sum61 = vle32_v_f32m1(pC + 4 * 13, vl); - _sum70 = vle32_v_f32m1(pC + 4 * 14, vl); - _sum71 = vle32_v_f32m1(pC + 4 * 15, vl); - _sum80 = vle32_v_f32m1(pC + 4 * 16, vl); - _sum81 = vle32_v_f32m1(pC + 4 * 17, vl); - _sum90 = vle32_v_f32m1(pC + 4 * 18, vl); - _sum91 = vle32_v_f32m1(pC + 4 * 19, vl); - _suma0 = vle32_v_f32m1(pC + 4 * 20, vl); - _suma1 = vle32_v_f32m1(pC + 4 * 21, vl); - _sumb0 = vle32_v_f32m1(pC + 4 * 22, vl); - _sumb1 = vle32_v_f32m1(pC + 4 * 23, vl); - pC += 96; - } - if (broadcast_type_C == 4) - { - _sum00 = vfmv_v_f_f32m1(pC[0], vl); - _sum10 = vfmv_v_f_f32m1(pC[1], vl); - _sum20 = vfmv_v_f_f32m1(pC[2], vl); - _sum30 = vfmv_v_f_f32m1(pC[3], vl); - _sum40 = vfmv_v_f_f32m1(pC[4], vl); - _sum50 = vfmv_v_f_f32m1(pC[5], vl); - _sum60 = vfmv_v_f_f32m1(pC[6], vl); - _sum70 = vfmv_v_f_f32m1(pC[7], vl); - _sum80 = vfmv_v_f_f32m1(pC[8], vl); - _sum90 = vfmv_v_f_f32m1(pC[9], vl); - _suma0 = vfmv_v_f_f32m1(pC[10], vl); - _sumb0 = vfmv_v_f_f32m1(pC[11], vl); - _sum01 = _sum00; - _sum11 = _sum10; - _sum21 = _sum20; - _sum31 = _sum30; - _sum41 = _sum40; - _sum51 = _sum50; - _sum61 = _sum60; - _sum71 = _sum70; - _sum81 = _sum80; - _sum91 = _sum90; - _suma1 = _suma0; - _sumb1 = _sumb0; - pC += 12; - } - } - } - else - { - _sum00 = vle32_v_f32m1(outptr, vl); - _sum01 = vle32_v_f32m1(outptr + 4 * 1, vl); - _sum10 = vle32_v_f32m1(outptr + 4 * 2, vl); - _sum11 = vle32_v_f32m1(outptr + 4 * 3, vl); - _sum20 = vle32_v_f32m1(outptr + 4 * 4, vl); - _sum21 = vle32_v_f32m1(outptr + 4 * 5, vl); - _sum30 = vle32_v_f32m1(outptr + 4 * 6, vl); - _sum31 = vle32_v_f32m1(outptr + 4 * 7, vl); - _sum40 = vle32_v_f32m1(outptr + 4 * 8, vl); - _sum41 = vle32_v_f32m1(outptr + 4 * 9, vl); - _sum50 = 
vle32_v_f32m1(outptr + 4 * 10, vl); - _sum51 = vle32_v_f32m1(outptr + 4 * 11, vl); - _sum60 = vle32_v_f32m1(outptr + 4 * 12, vl); - _sum61 = vle32_v_f32m1(outptr + 4 * 13, vl); - _sum70 = vle32_v_f32m1(outptr + 4 * 14, vl); - _sum71 = vle32_v_f32m1(outptr + 4 * 15, vl); - _sum80 = vle32_v_f32m1(outptr + 4 * 16, vl); - _sum81 = vle32_v_f32m1(outptr + 4 * 17, vl); - _sum90 = vle32_v_f32m1(outptr + 4 * 18, vl); - _sum91 = vle32_v_f32m1(outptr + 4 * 19, vl); - _suma0 = vle32_v_f32m1(outptr + 4 * 20, vl); - _suma1 = vle32_v_f32m1(outptr + 4 * 21, vl); - _sumb0 = vle32_v_f32m1(outptr + 4 * 22, vl); - _sumb1 = vle32_v_f32m1(outptr + 4 * 23, vl); - } - - const float* pA = pAT; - int kk = 0; - for (; kk + 3 < max_kk; kk += 4) - { - vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); - vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); - - _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); - _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); - _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); - _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); - _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); - _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); - _sum40 = vfmadd_vf_f32m1(_pA0, pB[4], _sum40, vl); - _sum41 = vfmadd_vf_f32m1(_pA1, pB[4], _sum41, vl); - _sum50 = vfmadd_vf_f32m1(_pA0, pB[5], _sum50, vl); - _sum51 = vfmadd_vf_f32m1(_pA1, pB[5], _sum51, vl); - _sum60 = vfmadd_vf_f32m1(_pA0, pB[6], _sum60, vl); - _sum61 = vfmadd_vf_f32m1(_pA1, pB[6], _sum61, vl); - _sum70 = vfmadd_vf_f32m1(_pA0, pB[7], _sum70, vl); - _sum71 = vfmadd_vf_f32m1(_pA1, pB[7], _sum71, vl); - _sum80 = vfmadd_vf_f32m1(_pA0, pB[8], _sum80, vl); - _sum81 = vfmadd_vf_f32m1(_pA1, pB[8], _sum81, vl); - _sum90 = vfmadd_vf_f32m1(_pA0, pB[9], _sum90, vl); - _sum91 = vfmadd_vf_f32m1(_pA1, pB[9], _sum91, vl); - _suma0 = vfmadd_vf_f32m1(_pA0, pB[10], _suma0, vl); - _suma1 = vfmadd_vf_f32m1(_pA1, pB[10], _suma1, vl); - _sumb0 = vfmadd_vf_f32m1(_pA0, pB[11], _sumb0, vl); - _sumb1 = vfmadd_vf_f32m1(_pA1, pB[11], _sumb1, vl); - - pA += 8; - pB += 12; - - _pA0 = vle32_v_f32m1(pA, vl); - _pA1 = vle32_v_f32m1(pA + 4, vl); - - _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); - _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); - _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); - _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); - _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); - _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); - _sum40 = vfmadd_vf_f32m1(_pA0, pB[4], _sum40, vl); - _sum41 = vfmadd_vf_f32m1(_pA1, pB[4], _sum41, vl); - _sum50 = vfmadd_vf_f32m1(_pA0, pB[5], _sum50, vl); - _sum51 = vfmadd_vf_f32m1(_pA1, pB[5], _sum51, vl); - _sum60 = vfmadd_vf_f32m1(_pA0, pB[6], _sum60, vl); - _sum61 = vfmadd_vf_f32m1(_pA1, pB[6], _sum61, vl); - _sum70 = vfmadd_vf_f32m1(_pA0, pB[7], _sum70, vl); - _sum71 = vfmadd_vf_f32m1(_pA1, pB[7], _sum71, vl); - _sum80 = vfmadd_vf_f32m1(_pA0, pB[8], _sum80, vl); - _sum81 = vfmadd_vf_f32m1(_pA1, pB[8], _sum81, vl); - _sum90 = vfmadd_vf_f32m1(_pA0, pB[9], _sum90, vl); - _sum91 = vfmadd_vf_f32m1(_pA1, pB[9], _sum91, vl); - _suma0 = vfmadd_vf_f32m1(_pA0, pB[10], _suma0, vl); - _suma1 = vfmadd_vf_f32m1(_pA1, pB[10], _suma1, vl); - _sumb0 = vfmadd_vf_f32m1(_pA0, pB[11], _sumb0, vl); - _sumb1 = vfmadd_vf_f32m1(_pA1, pB[11], _sumb1, vl); - - pA += 8; - pB += 12; - - _pA0 = vle32_v_f32m1(pA, vl); - _pA1 = 
vle32_v_f32m1(pA + 4, vl); - - _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); - _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); - _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); - _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); - _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); - _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); - _sum40 = vfmadd_vf_f32m1(_pA0, pB[4], _sum40, vl); - _sum41 = vfmadd_vf_f32m1(_pA1, pB[4], _sum41, vl); - _sum50 = vfmadd_vf_f32m1(_pA0, pB[5], _sum50, vl); - _sum51 = vfmadd_vf_f32m1(_pA1, pB[5], _sum51, vl); - _sum60 = vfmadd_vf_f32m1(_pA0, pB[6], _sum60, vl); - _sum61 = vfmadd_vf_f32m1(_pA1, pB[6], _sum61, vl); - _sum70 = vfmadd_vf_f32m1(_pA0, pB[7], _sum70, vl); - _sum71 = vfmadd_vf_f32m1(_pA1, pB[7], _sum71, vl); - _sum80 = vfmadd_vf_f32m1(_pA0, pB[8], _sum80, vl); - _sum81 = vfmadd_vf_f32m1(_pA1, pB[8], _sum81, vl); - _sum90 = vfmadd_vf_f32m1(_pA0, pB[9], _sum90, vl); - _sum91 = vfmadd_vf_f32m1(_pA1, pB[9], _sum91, vl); - _suma0 = vfmadd_vf_f32m1(_pA0, pB[10], _suma0, vl); - _suma1 = vfmadd_vf_f32m1(_pA1, pB[10], _suma1, vl); - _sumb0 = vfmadd_vf_f32m1(_pA0, pB[11], _sumb0, vl); - _sumb1 = vfmadd_vf_f32m1(_pA1, pB[11], _sumb1, vl); - - pA += 8; - pB += 12; - - _pA0 = vle32_v_f32m1(pA, vl); - _pA1 = vle32_v_f32m1(pA + 4, vl); - - _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); - _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); - _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); - _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); - _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); - _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); - _sum40 = vfmadd_vf_f32m1(_pA0, pB[4], _sum40, vl); - _sum41 = vfmadd_vf_f32m1(_pA1, pB[4], _sum41, vl); - _sum50 = vfmadd_vf_f32m1(_pA0, pB[5], _sum50, vl); - _sum51 = vfmadd_vf_f32m1(_pA1, pB[5], _sum51, vl); - _sum60 = vfmadd_vf_f32m1(_pA0, pB[6], _sum60, vl); - _sum61 = vfmadd_vf_f32m1(_pA1, pB[6], _sum61, vl); - _sum70 = vfmadd_vf_f32m1(_pA0, pB[7], _sum70, vl); - _sum71 = vfmadd_vf_f32m1(_pA1, pB[7], _sum71, vl); - _sum80 = vfmadd_vf_f32m1(_pA0, pB[8], _sum80, vl); - _sum81 = vfmadd_vf_f32m1(_pA1, pB[8], _sum81, vl); - _sum90 = vfmadd_vf_f32m1(_pA0, pB[9], _sum90, vl); - _sum91 = vfmadd_vf_f32m1(_pA1, pB[9], _sum91, vl); - _suma0 = vfmadd_vf_f32m1(_pA0, pB[10], _suma0, vl); - _suma1 = vfmadd_vf_f32m1(_pA1, pB[10], _suma1, vl); - _sumb0 = vfmadd_vf_f32m1(_pA0, pB[11], _sumb0, vl); - _sumb1 = vfmadd_vf_f32m1(_pA1, pB[11], _sumb1, vl); - - pA += 8; - pB += 12; - } - for (; kk < max_kk; kk += 1) - { - vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); - vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); - - _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); - _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); - _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); - _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); - _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); - _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); - _sum40 = vfmadd_vf_f32m1(_pA0, pB[4], _sum40, vl); - _sum41 = vfmadd_vf_f32m1(_pA1, pB[4], _sum41, vl); - _sum50 = vfmadd_vf_f32m1(_pA0, pB[5], _sum50, vl); - _sum51 = vfmadd_vf_f32m1(_pA1, pB[5], _sum51, vl); - _sum60 = vfmadd_vf_f32m1(_pA0, pB[6], _sum60, vl); - _sum61 = 
vfmadd_vf_f32m1(_pA1, pB[6], _sum61, vl); - _sum70 = vfmadd_vf_f32m1(_pA0, pB[7], _sum70, vl); - _sum71 = vfmadd_vf_f32m1(_pA1, pB[7], _sum71, vl); - _sum80 = vfmadd_vf_f32m1(_pA0, pB[8], _sum80, vl); - _sum81 = vfmadd_vf_f32m1(_pA1, pB[8], _sum81, vl); - _sum90 = vfmadd_vf_f32m1(_pA0, pB[9], _sum90, vl); - _sum91 = vfmadd_vf_f32m1(_pA1, pB[9], _sum91, vl); - _suma0 = vfmadd_vf_f32m1(_pA0, pB[10], _suma0, vl); - _suma1 = vfmadd_vf_f32m1(_pA1, pB[10], _suma1, vl); - _sumb0 = vfmadd_vf_f32m1(_pA0, pB[11], _sumb0, vl); - _sumb1 = vfmadd_vf_f32m1(_pA1, pB[11], _sumb1, vl); - - pA += 8; - pB += 12; - } - - if (k_end) - { - if (out_elempack == 4) - { - vse32_v_f32m1(outptr0, _sum00, vl); - vse32_v_f32m1(outptr0 + 4, _sum10, vl); - vse32_v_f32m1(outptr0 + 4 * 2, _sum20, vl); - vse32_v_f32m1(outptr0 + 4 * 3, _sum30, vl); - vse32_v_f32m1(outptr0 + 4 * 4, _sum40, vl); - vse32_v_f32m1(outptr0 + 4 * 5, _sum50, vl); - vse32_v_f32m1(outptr0 + 4 * 6, _sum60, vl); - vse32_v_f32m1(outptr0 + 4 * 7, _sum70, vl); - vse32_v_f32m1(outptr0 + 4 * 8, _sum80, vl); - vse32_v_f32m1(outptr0 + 4 * 9, _sum90, vl); - vse32_v_f32m1(outptr0 + 4 * 10, _suma0, vl); - vse32_v_f32m1(outptr0 + 4 * 11, _sumb0, vl); - - vse32_v_f32m1(outptr0 + out_hstep * 4, _sum01, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4, _sum11, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 2, _sum21, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 3, _sum31, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 4, _sum41, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 5, _sum51, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 6, _sum61, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 7, _sum71, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 8, _sum81, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 9, _sum91, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 10, _suma1, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 11, _sumb1, vl); - - outptr0 += 48; - } - if (out_elempack == 1) - { - transpose8x12_ps(_sum00, _sum01, _sum10, _sum11, _sum20, _sum21, _sum30, _sum31, _sum40, _sum41, _sum50, _sum51, _sum60, _sum61, _sum70, _sum71, _sum80, _sum81, _sum90, _sum91, _suma0, _suma1, _sumb0, _sumb1, vl); - - vse32_v_f32m1(outptr0, _sum00, vl); - vse32_v_f32m1(outptr0 + 4, _sum01, vl); - vse32_v_f32m1(outptr0 + 8, _sum10, vl); - vse32_v_f32m1(outptr0 + out_hstep, _sum11, vl); - vse32_v_f32m1(outptr0 + out_hstep + 4, _sum20, vl); - vse32_v_f32m1(outptr0 + out_hstep + 8, _sum21, vl); - vse32_v_f32m1(outptr0 + out_hstep * 2, _sum30, vl); - vse32_v_f32m1(outptr0 + out_hstep * 2 + 4, _sum31, vl); - vse32_v_f32m1(outptr0 + out_hstep * 2 + 8, _sum40, vl); - vse32_v_f32m1(outptr0 + out_hstep * 3, _sum41, vl); - vse32_v_f32m1(outptr0 + out_hstep * 3 + 4, _sum50, vl); - vse32_v_f32m1(outptr0 + out_hstep * 3 + 8, _sum51, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4, _sum60, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4, _sum61, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 8, _sum70, vl); - vse32_v_f32m1(outptr0 + out_hstep * 5, _sum71, vl); - vse32_v_f32m1(outptr0 + out_hstep * 5 + 4, _sum80, vl); - vse32_v_f32m1(outptr0 + out_hstep * 5 + 8, _sum81, vl); - vse32_v_f32m1(outptr0 + out_hstep * 6, _sum90, vl); - vse32_v_f32m1(outptr0 + out_hstep * 6 + 4, _sum91, vl); - vse32_v_f32m1(outptr0 + out_hstep * 6 + 8, _suma0, vl); - vse32_v_f32m1(outptr0 + out_hstep * 7, _suma1, vl); - vse32_v_f32m1(outptr0 + out_hstep * 7 + 4, _sumb0, vl); - vse32_v_f32m1(outptr0 + out_hstep * 7 + 8, _sumb1, vl); - - outptr0 += 12; - } - } - else - { - 
vse32_v_f32m1(outptr, _sum00, vl); - vse32_v_f32m1(outptr + 4, _sum01, vl); - vse32_v_f32m1(outptr + 4 * 2, _sum10, vl); - vse32_v_f32m1(outptr + 4 * 3, _sum11, vl); - vse32_v_f32m1(outptr + 4 * 4, _sum20, vl); - vse32_v_f32m1(outptr + 4 * 5, _sum21, vl); - vse32_v_f32m1(outptr + 4 * 6, _sum30, vl); - vse32_v_f32m1(outptr + 4 * 7, _sum31, vl); - vse32_v_f32m1(outptr + 4 * 8, _sum40, vl); - vse32_v_f32m1(outptr + 4 * 9, _sum41, vl); - vse32_v_f32m1(outptr + 4 * 10, _sum50, vl); - vse32_v_f32m1(outptr + 4 * 11, _sum51, vl); - vse32_v_f32m1(outptr + 4 * 12, _sum60, vl); - vse32_v_f32m1(outptr + 4 * 13, _sum61, vl); - vse32_v_f32m1(outptr + 4 * 14, _sum70, vl); - vse32_v_f32m1(outptr + 4 * 15, _sum71, vl); - vse32_v_f32m1(outptr + 4 * 16, _sum80, vl); - vse32_v_f32m1(outptr + 4 * 17, _sum81, vl); - vse32_v_f32m1(outptr + 4 * 18, _sum90, vl); - vse32_v_f32m1(outptr + 4 * 19, _sum91, vl); - vse32_v_f32m1(outptr + 4 * 20, _suma0, vl); - vse32_v_f32m1(outptr + 4 * 21, _suma1, vl); - vse32_v_f32m1(outptr + 4 * 22, _sumb0, vl); - vse32_v_f32m1(outptr + 4 * 23, _sumb1, vl); - } - - outptr += 96; - } - for (; jj + 7 < max_jj; jj += 8) - { - vfloat32m1_t _sum00; - vfloat32m1_t _sum01; - vfloat32m1_t _sum10; - vfloat32m1_t _sum11; - vfloat32m1_t _sum20; - vfloat32m1_t _sum21; - vfloat32m1_t _sum30; - vfloat32m1_t _sum31; - vfloat32m1_t _sum40; - vfloat32m1_t _sum41; - vfloat32m1_t _sum50; - vfloat32m1_t _sum51; - vfloat32m1_t _sum60; - vfloat32m1_t _sum61; - vfloat32m1_t _sum70; - vfloat32m1_t _sum71; - - if (k == 0) - { - _sum00 = vfmv_v_f_f32m1(0.f, vl); - _sum01 = vfmv_v_f_f32m1(0.f, vl); - _sum10 = vfmv_v_f_f32m1(0.f, vl); - _sum11 = vfmv_v_f_f32m1(0.f, vl); - _sum20 = vfmv_v_f_f32m1(0.f, vl); - _sum21 = vfmv_v_f_f32m1(0.f, vl); - _sum30 = vfmv_v_f_f32m1(0.f, vl); - _sum31 = vfmv_v_f_f32m1(0.f, vl); - _sum40 = vfmv_v_f_f32m1(0.f, vl); - _sum41 = vfmv_v_f_f32m1(0.f, vl); - _sum50 = vfmv_v_f_f32m1(0.f, vl); - _sum51 = vfmv_v_f_f32m1(0.f, vl); - _sum60 = vfmv_v_f_f32m1(0.f, vl); - _sum61 = vfmv_v_f_f32m1(0.f, vl); - _sum70 = vfmv_v_f_f32m1(0.f, vl); - _sum71 = vfmv_v_f_f32m1(0.f, vl); - - if (pC) - { - if (broadcast_type_C == 0) - { - _sum00 = vfmv_v_f_f32m1(pC[0], vl); - _sum01 = vfmv_v_f_f32m1(pC[0], vl); - _sum10 = vfmv_v_f_f32m1(pC[0], vl); - _sum11 = vfmv_v_f_f32m1(pC[0], vl); - _sum20 = vfmv_v_f_f32m1(pC[0], vl); - _sum21 = vfmv_v_f_f32m1(pC[0], vl); - _sum30 = vfmv_v_f_f32m1(pC[0], vl); - _sum31 = vfmv_v_f_f32m1(pC[0], vl); - _sum40 = vfmv_v_f_f32m1(pC[0], vl); - _sum41 = vfmv_v_f_f32m1(pC[0], vl); - _sum50 = vfmv_v_f_f32m1(pC[0], vl); - _sum51 = vfmv_v_f_f32m1(pC[0], vl); - _sum60 = vfmv_v_f_f32m1(pC[0], vl); - _sum61 = vfmv_v_f_f32m1(pC[0], vl); - _sum70 = vfmv_v_f_f32m1(pC[0], vl); - _sum71 = vfmv_v_f_f32m1(pC[0], vl); - } - if (broadcast_type_C == 1 || broadcast_type_C == 2) - { - _sum00 = vle32_v_f32m1(pC, vl); - _sum01 = vle32_v_f32m1(pC + 4, vl); - _sum10 = _sum00; - _sum11 = _sum01; - _sum20 = _sum00; - _sum21 = _sum01; - _sum30 = _sum00; - _sum31 = _sum01; - _sum40 = _sum00; - _sum41 = _sum01; - _sum50 = _sum00; - _sum51 = _sum01; - _sum60 = _sum00; - _sum61 = _sum01; - _sum70 = _sum00; - _sum71 = _sum01; - } - if (broadcast_type_C == 3) - { - _sum00 = vle32_v_f32m1(pC, vl); - _sum01 = vle32_v_f32m1(pC + 4 * 1, vl); - _sum10 = vle32_v_f32m1(pC + 4 * 2, vl); - _sum11 = vle32_v_f32m1(pC + 4 * 3, vl); - _sum20 = vle32_v_f32m1(pC + 4 * 4, vl); - _sum21 = vle32_v_f32m1(pC + 4 * 5, vl); - _sum30 = vle32_v_f32m1(pC + 4 * 6, vl); - _sum31 = vle32_v_f32m1(pC + 4 * 7, vl); - _sum40 = 
vle32_v_f32m1(pC + 4 * 8, vl); - _sum41 = vle32_v_f32m1(pC + 4 * 9, vl); - _sum50 = vle32_v_f32m1(pC + 4 * 10, vl); - _sum51 = vle32_v_f32m1(pC + 4 * 11, vl); - _sum60 = vle32_v_f32m1(pC + 4 * 12, vl); - _sum61 = vle32_v_f32m1(pC + 4 * 13, vl); - _sum70 = vle32_v_f32m1(pC + 4 * 14, vl); - _sum71 = vle32_v_f32m1(pC + 4 * 15, vl); - pC += 64; - } - if (broadcast_type_C == 4) - { - _sum00 = vfmv_v_f_f32m1(pC[0], vl); - _sum10 = vfmv_v_f_f32m1(pC[1], vl); - _sum20 = vfmv_v_f_f32m1(pC[2], vl); - _sum30 = vfmv_v_f_f32m1(pC[3], vl); - _sum40 = vfmv_v_f_f32m1(pC[4], vl); - _sum50 = vfmv_v_f_f32m1(pC[5], vl); - _sum60 = vfmv_v_f_f32m1(pC[6], vl); - _sum70 = vfmv_v_f_f32m1(pC[7], vl); - _sum01 = _sum00; - _sum11 = _sum10; - _sum21 = _sum20; - _sum31 = _sum30; - _sum41 = _sum40; - _sum51 = _sum50; - _sum61 = _sum60; - _sum71 = _sum70; - pC += 8; - } - } - } - else - { - _sum00 = vle32_v_f32m1(outptr, vl); - _sum01 = vle32_v_f32m1(outptr + 4 * 1, vl); - _sum10 = vle32_v_f32m1(outptr + 4 * 2, vl); - _sum11 = vle32_v_f32m1(outptr + 4 * 3, vl); - _sum20 = vle32_v_f32m1(outptr + 4 * 4, vl); - _sum21 = vle32_v_f32m1(outptr + 4 * 5, vl); - _sum30 = vle32_v_f32m1(outptr + 4 * 6, vl); - _sum31 = vle32_v_f32m1(outptr + 4 * 7, vl); - _sum40 = vle32_v_f32m1(outptr + 4 * 8, vl); - _sum41 = vle32_v_f32m1(outptr + 4 * 9, vl); - _sum50 = vle32_v_f32m1(outptr + 4 * 10, vl); - _sum51 = vle32_v_f32m1(outptr + 4 * 11, vl); - _sum60 = vle32_v_f32m1(outptr + 4 * 12, vl); - _sum61 = vle32_v_f32m1(outptr + 4 * 13, vl); - _sum70 = vle32_v_f32m1(outptr + 4 * 14, vl); - _sum71 = vle32_v_f32m1(outptr + 4 * 15, vl); - } - - const float* pA = pAT; - int kk = 0; - for (; kk < max_kk; kk += 1) - { - vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); - vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); - - _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); - _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); - _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); - _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); - _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); - _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); - _sum40 = vfmadd_vf_f32m1(_pA0, pB[4], _sum40, vl); - _sum41 = vfmadd_vf_f32m1(_pA1, pB[4], _sum41, vl); - _sum50 = vfmadd_vf_f32m1(_pA0, pB[5], _sum50, vl); - _sum51 = vfmadd_vf_f32m1(_pA1, pB[5], _sum51, vl); - _sum60 = vfmadd_vf_f32m1(_pA0, pB[6], _sum60, vl); - _sum61 = vfmadd_vf_f32m1(_pA1, pB[6], _sum61, vl); - _sum70 = vfmadd_vf_f32m1(_pA0, pB[7], _sum70, vl); - _sum71 = vfmadd_vf_f32m1(_pA1, pB[7], _sum71, vl); - - pA += 8; - pB += 8; - } - - if (k_end) - { - if (out_elempack == 4) - { - vse32_v_f32m1(outptr0, _sum00, vl); - vse32_v_f32m1(outptr0 + 4, _sum10, vl); - vse32_v_f32m1(outptr0 + 4 * 2, _sum20, vl); - vse32_v_f32m1(outptr0 + 4 * 3, _sum30, vl); - vse32_v_f32m1(outptr0 + 4 * 4, _sum40, vl); - vse32_v_f32m1(outptr0 + 4 * 5, _sum50, vl); - vse32_v_f32m1(outptr0 + 4 * 6, _sum60, vl); - vse32_v_f32m1(outptr0 + 4 * 7, _sum70, vl); - - vse32_v_f32m1(outptr0 + out_hstep * 4, _sum01, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4, _sum11, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 2, _sum21, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 3, _sum31, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 4, _sum41, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 5, _sum51, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 6, _sum61, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 7, _sum71, vl); - - 
outptr0 += 32; - } - if (out_elempack == 1) - { - transpose8x8_ps(_sum00, _sum01, _sum10, _sum11, _sum20, _sum21, _sum30, _sum31, _sum40, _sum41, _sum50, _sum51, _sum60, _sum61, _sum70, _sum71, vl); - - vse32_v_f32m1(outptr0, _sum00, vl); - vse32_v_f32m1(outptr0 + 4, _sum01, vl); - vse32_v_f32m1(outptr0 + out_hstep, _sum10, vl); - vse32_v_f32m1(outptr0 + out_hstep + 4, _sum11, vl); - vse32_v_f32m1(outptr0 + out_hstep * 2, _sum20, vl); - vse32_v_f32m1(outptr0 + out_hstep * 2 + 4, _sum21, vl); - vse32_v_f32m1(outptr0 + out_hstep * 3, _sum30, vl); - vse32_v_f32m1(outptr0 + out_hstep * 3 + 4, _sum31, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4, _sum40, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4, _sum41, vl); - vse32_v_f32m1(outptr0 + out_hstep * 5, _sum50, vl); - vse32_v_f32m1(outptr0 + out_hstep * 5 + 4, _sum51, vl); - vse32_v_f32m1(outptr0 + out_hstep * 6, _sum60, vl); - vse32_v_f32m1(outptr0 + out_hstep * 6 + 4, _sum61, vl); - vse32_v_f32m1(outptr0 + out_hstep * 7, _sum70, vl); - vse32_v_f32m1(outptr0 + out_hstep * 7 + 4, _sum71, vl); - - outptr0 += 8; - } - } - else - { - vse32_v_f32m1(outptr, _sum00, vl); - vse32_v_f32m1(outptr + 4, _sum01, vl); - vse32_v_f32m1(outptr + 4 * 2, _sum10, vl); - vse32_v_f32m1(outptr + 4 * 3, _sum11, vl); - vse32_v_f32m1(outptr + 4 * 4, _sum20, vl); - vse32_v_f32m1(outptr + 4 * 5, _sum21, vl); - vse32_v_f32m1(outptr + 4 * 6, _sum30, vl); - vse32_v_f32m1(outptr + 4 * 7, _sum31, vl); - vse32_v_f32m1(outptr + 4 * 8, _sum40, vl); - vse32_v_f32m1(outptr + 4 * 9, _sum41, vl); - vse32_v_f32m1(outptr + 4 * 10, _sum50, vl); - vse32_v_f32m1(outptr + 4 * 11, _sum51, vl); - vse32_v_f32m1(outptr + 4 * 12, _sum60, vl); - vse32_v_f32m1(outptr + 4 * 13, _sum61, vl); - vse32_v_f32m1(outptr + 4 * 14, _sum70, vl); - vse32_v_f32m1(outptr + 4 * 15, _sum71, vl); - } - - outptr += 64; - } - for (; jj + 3 < max_jj; jj += 4) - { - vfloat32m1_t _sum00; - vfloat32m1_t _sum01; - vfloat32m1_t _sum10; - vfloat32m1_t _sum11; - vfloat32m1_t _sum20; - vfloat32m1_t _sum21; - vfloat32m1_t _sum30; - vfloat32m1_t _sum31; - - if (k == 0) - { - _sum00 = vfmv_v_f_f32m1(0.f, vl); - _sum01 = vfmv_v_f_f32m1(0.f, vl); - _sum10 = vfmv_v_f_f32m1(0.f, vl); - _sum11 = vfmv_v_f_f32m1(0.f, vl); - _sum20 = vfmv_v_f_f32m1(0.f, vl); - _sum21 = vfmv_v_f_f32m1(0.f, vl); - _sum30 = vfmv_v_f_f32m1(0.f, vl); - _sum31 = vfmv_v_f_f32m1(0.f, vl); - - if (pC) - { - if (broadcast_type_C == 0) - { - _sum00 = vfmv_v_f_f32m1(pC[0], vl); - _sum01 = vfmv_v_f_f32m1(pC[0], vl); - _sum10 = vfmv_v_f_f32m1(pC[0], vl); - _sum11 = vfmv_v_f_f32m1(pC[0], vl); - _sum20 = vfmv_v_f_f32m1(pC[0], vl); - _sum21 = vfmv_v_f_f32m1(pC[0], vl); - _sum30 = vfmv_v_f_f32m1(pC[0], vl); - _sum31 = vfmv_v_f_f32m1(pC[0], vl); - } - if (broadcast_type_C == 1 || broadcast_type_C == 2) - { - _sum00 = vle32_v_f32m1(pC, vl); - _sum01 = vle32_v_f32m1(pC + 4, vl); - _sum10 = _sum00; - _sum11 = _sum01; - _sum20 = _sum00; - _sum21 = _sum01; - _sum30 = _sum00; - _sum31 = _sum01; - } - if (broadcast_type_C == 3) - { - _sum00 = vle32_v_f32m1(pC, vl); - _sum01 = vle32_v_f32m1(pC + 4 * 1, vl); - _sum10 = vle32_v_f32m1(pC + 4 * 2, vl); - _sum11 = vle32_v_f32m1(pC + 4 * 3, vl); - _sum20 = vle32_v_f32m1(pC + 4 * 4, vl); - _sum21 = vle32_v_f32m1(pC + 4 * 5, vl); - _sum30 = vle32_v_f32m1(pC + 4 * 6, vl); - _sum31 = vle32_v_f32m1(pC + 4 * 7, vl); - pC += 32; - } - if (broadcast_type_C == 4) - { - _sum00 = vfmv_v_f_f32m1(pC[0], vl); - _sum10 = vfmv_v_f_f32m1(pC[1], vl); - _sum20 = vfmv_v_f_f32m1(pC[2], vl); - _sum30 = vfmv_v_f_f32m1(pC[3], vl); - _sum01 = _sum00; - 
_sum11 = _sum10; - _sum21 = _sum20; - _sum31 = _sum30; - pC += 4; - } - } - } - else - { - _sum00 = vle32_v_f32m1(outptr, vl); - _sum01 = vle32_v_f32m1(outptr + 4 * 1, vl); - _sum10 = vle32_v_f32m1(outptr + 4 * 2, vl); - _sum11 = vle32_v_f32m1(outptr + 4 * 3, vl); - _sum20 = vle32_v_f32m1(outptr + 4 * 4, vl); - _sum21 = vle32_v_f32m1(outptr + 4 * 5, vl); - _sum30 = vle32_v_f32m1(outptr + 4 * 6, vl); - _sum31 = vle32_v_f32m1(outptr + 4 * 7, vl); - } - - const float* pA = pAT; - int kk = 0; - for (; kk < max_kk; kk += 1) - { - vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); - vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); - - _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); - _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); - _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); - _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); - _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); - _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); - - pA += 8; - pB += 4; - } - - if (k_end) - { - if (out_elempack == 4) - { - vse32_v_f32m1(outptr0, _sum00, vl); - vse32_v_f32m1(outptr0 + 4, _sum10, vl); - vse32_v_f32m1(outptr0 + 4 * 2, _sum20, vl); - vse32_v_f32m1(outptr0 + 4 * 3, _sum30, vl); - - vse32_v_f32m1(outptr0 + out_hstep * 4, _sum01, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4, _sum11, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 2, _sum21, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 3, _sum31, vl); - - outptr0 += 16; - } - if (out_elempack == 1) - { - transpose8x4_ps(_sum00, _sum01, _sum10, _sum11, _sum20, _sum21, _sum30, _sum31, vl); - - vse32_v_f32m1(outptr0, _sum00, vl); - vse32_v_f32m1(outptr0 + out_hstep * 1, _sum01, vl); - vse32_v_f32m1(outptr0 + out_hstep * 2, _sum10, vl); - vse32_v_f32m1(outptr0 + out_hstep * 3, _sum11, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4, _sum20, vl); - vse32_v_f32m1(outptr0 + out_hstep * 5, _sum21, vl); - vse32_v_f32m1(outptr0 + out_hstep * 6, _sum30, vl); - vse32_v_f32m1(outptr0 + out_hstep * 7, _sum31, vl); - - outptr0 += 4; - } - } - else - { - vse32_v_f32m1(outptr, _sum00, vl); - vse32_v_f32m1(outptr + 4, _sum01, vl); - vse32_v_f32m1(outptr + 4 * 2, _sum10, vl); - vse32_v_f32m1(outptr + 4 * 3, _sum11, vl); - vse32_v_f32m1(outptr + 4 * 4, _sum20, vl); - vse32_v_f32m1(outptr + 4 * 5, _sum21, vl); - vse32_v_f32m1(outptr + 4 * 6, _sum30, vl); - vse32_v_f32m1(outptr + 4 * 7, _sum31, vl); - } - - outptr += 32; - } - for (; jj + 1 < max_jj; jj += 2) - { - vfloat32m1_t _sum00; - vfloat32m1_t _sum01; - vfloat32m1_t _sum10; - vfloat32m1_t _sum11; - - if (k == 0) - { - _sum00 = vfmv_v_f_f32m1(0.f, vl); - _sum01 = vfmv_v_f_f32m1(0.f, vl); - _sum10 = vfmv_v_f_f32m1(0.f, vl); - _sum11 = vfmv_v_f_f32m1(0.f, vl); - - if (pC) - { - if (broadcast_type_C == 0) - { - _sum00 = vfmv_v_f_f32m1(pC[0], vl); - _sum01 = vfmv_v_f_f32m1(pC[0], vl); - _sum10 = vfmv_v_f_f32m1(pC[0], vl); - _sum11 = vfmv_v_f_f32m1(pC[0], vl); - } - if (broadcast_type_C == 1 || broadcast_type_C == 2) - { - _sum00 = vle32_v_f32m1(pC, vl); - _sum01 = vle32_v_f32m1(pC + 4, vl); - _sum10 = _sum00; - _sum11 = _sum01; - } - if (broadcast_type_C == 3) - { - _sum00 = vle32_v_f32m1(pC, vl); - _sum01 = vle32_v_f32m1(pC + 4 * 1, vl); - _sum10 = vle32_v_f32m1(pC + 4 * 2, vl); - _sum11 = vle32_v_f32m1(pC + 4 * 3, vl); - pC += 16; - } - if (broadcast_type_C == 4) - { - _sum00 = vfmv_v_f_f32m1(pC[0], vl); - _sum10 = vfmv_v_f_f32m1(pC[1], vl); - _sum01 = _sum00; - _sum11 = _sum10; - pC += 2; - } - } - } 
- else - { - _sum00 = vle32_v_f32m1(outptr, vl); - _sum01 = vle32_v_f32m1(outptr + 4 * 1, vl); - _sum10 = vle32_v_f32m1(outptr + 4 * 2, vl); - _sum11 = vle32_v_f32m1(outptr + 4 * 3, vl); - } - - const float* pA = pAT; - int kk = 0; - for (; kk < max_kk; kk += 1) - { - vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); - vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); - - _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); - _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); - - pA += 8; - pB += 2; - } - - if (k_end) - { - if (out_elempack == 4) - { - vse32_v_f32m1(outptr0, _sum00, vl); - vse32_v_f32m1(outptr0 + 4, _sum10, vl); - - vse32_v_f32m1(outptr0 + out_hstep * 4, _sum01, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4 + 4, _sum11, vl); - outptr0 += 8; - } - if (out_elempack == 1) - { - float sum0[8]; - float sum1[8]; - vse32_v_f32m1(sum0, _sum00, vl); - vse32_v_f32m1(sum0 + 4, _sum01, vl); - vse32_v_f32m1(sum1, _sum10, vl); - vse32_v_f32m1(sum1 + 4, _sum11, vl); - - outptr0[0] = sum0[0]; - outptr0[out_hstep] = sum0[1]; - outptr0[out_hstep * 2] = sum0[2]; - outptr0[out_hstep * 3] = sum0[3]; - outptr0[out_hstep * 4] = sum0[4]; - outptr0[out_hstep * 5] = sum0[5]; - outptr0[out_hstep * 6] = sum0[6]; - outptr0[out_hstep * 7] = sum0[7]; - - outptr0[1] = sum1[0]; - outptr0[out_hstep + 1] = sum1[1]; - outptr0[out_hstep * 2 + 1] = sum1[2]; - outptr0[out_hstep * 3 + 1] = sum1[3]; - outptr0[out_hstep * 4 + 1] = sum1[4]; - outptr0[out_hstep * 5 + 1] = sum1[5]; - outptr0[out_hstep * 6 + 1] = sum1[6]; - outptr0[out_hstep * 7 + 1] = sum1[7]; - outptr0 += 2; - } - } - else - { - vse32_v_f32m1(outptr, _sum00, vl); - vse32_v_f32m1(outptr + 4, _sum01, vl); - vse32_v_f32m1(outptr + 4 * 2, _sum10, vl); - vse32_v_f32m1(outptr + 4 * 3, _sum11, vl); - } - - outptr += 16; - } - for (; jj < max_jj; jj += 1) - { - vfloat32m1_t _sum00; - vfloat32m1_t _sum01; - - if (k == 0) - { - _sum00 = vfmv_v_f_f32m1(0.f, vl); - _sum01 = vfmv_v_f_f32m1(0.f, vl); - - if (pC) - { - if (broadcast_type_C == 0) - { - _sum00 = vfmv_v_f_f32m1(pC[0], vl); - _sum01 = vfmv_v_f_f32m1(pC[0], vl); - } - if (broadcast_type_C == 1 || broadcast_type_C == 2) - { - _sum00 = vle32_v_f32m1(pC, vl); - _sum01 = vle32_v_f32m1(pC + 4, vl); - } - if (broadcast_type_C == 3) - { - _sum00 = vle32_v_f32m1(pC, vl); - _sum01 = vle32_v_f32m1(pC + 4, vl); - pC += 8; - } - if (broadcast_type_C == 4) - { - _sum00 = vfmv_v_f_f32m1(pC[0], vl); - _sum01 = _sum00; - pC += 1; - } - } - } - else - { - _sum00 = vle32_v_f32m1(outptr, vl); - _sum01 = vle32_v_f32m1(outptr + 4 * 1, vl); - } - - const float* pA = pAT; - int kk = 0; - for (; kk < max_kk; kk += 1) - { - vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); - vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); - - vfloat32m1_t _pB = vfmv_v_f_f32m1(pB[0], vl); - - _sum00 = vfmadd_vv_f32m1(_pA0, _pB, _sum00, vl); - _sum01 = vfmadd_vv_f32m1(_pA1, _pB, _sum01, vl); - - pA += 8; - pB += 1; - } - - if (k_end) - { - if (out_elempack == 4) - { - vse32_v_f32m1(outptr0, _sum00, vl); - vse32_v_f32m1(outptr0 + out_hstep * 4, _sum01, vl); - outptr0 += 4; - } - if (out_elempack == 1) - { - float sum0[8]; - vse32_v_f32m1(sum0, _sum00, vl); - vse32_v_f32m1(sum0 + 4, _sum01, vl); - - outptr0[0] = sum0[0]; - outptr0[out_hstep * 1] = sum0[1]; - outptr0[out_hstep * 2] = sum0[2]; - outptr0[out_hstep * 3] = sum0[3]; - outptr0[out_hstep * 4] = sum0[4]; - outptr0[out_hstep * 5] = sum0[5]; - outptr0[out_hstep * 6] = sum0[6]; - outptr0[out_hstep * 7] = 
sum0[7]; - outptr0++; - } - } - else - { - vse32_v_f32m1(outptr, _sum00, vl); - vse32_v_f32m1(outptr + 4, _sum01, vl); - } - - outptr += 8; - } - - pAT += max_kk * 8; - } - for (; ii + 3 < max_ii; ii += 4) - { - float* outptr0 = (float*)top_blob + (i + ii) * out_hstep + j * out_elempack; - - const float* pB = pBT; - - if (pC) - { - if (broadcast_type_C == 1 || broadcast_type_C == 2) - { - pC = (const float*)CT_tile + i + ii; - } - if (broadcast_type_C == 4) - { - pC = (const float*)CT_tile + j; - } - } - - int jj = 0; - for (; jj + 11 < max_jj; jj += 12) - { - vfloat32m1_t _sum0; - vfloat32m1_t _sum1; - vfloat32m1_t _sum2; - vfloat32m1_t _sum3; - vfloat32m1_t _sum4; - vfloat32m1_t _sum5; - vfloat32m1_t _sum6; - vfloat32m1_t _sum7; - vfloat32m1_t _sum8; - vfloat32m1_t _sum9; - vfloat32m1_t _suma; - vfloat32m1_t _sumb; - - if (k == 0) - { - _sum0 = vfmv_v_f_f32m1(0.f, vl); - _sum1 = vfmv_v_f_f32m1(0.f, vl); - _sum2 = vfmv_v_f_f32m1(0.f, vl); - _sum3 = vfmv_v_f_f32m1(0.f, vl); - _sum4 = vfmv_v_f_f32m1(0.f, vl); - _sum5 = vfmv_v_f_f32m1(0.f, vl); - _sum6 = vfmv_v_f_f32m1(0.f, vl); - _sum7 = vfmv_v_f_f32m1(0.f, vl); - _sum8 = vfmv_v_f_f32m1(0.f, vl); - _sum9 = vfmv_v_f_f32m1(0.f, vl); - _suma = vfmv_v_f_f32m1(0.f, vl); - _sumb = vfmv_v_f_f32m1(0.f, vl); - - if (pC) - { - if (broadcast_type_C == 0) - { - _sum0 = vfmv_v_f_f32m1(pC[0], vl); - _sum1 = vfmv_v_f_f32m1(pC[0], vl); - _sum2 = vfmv_v_f_f32m1(pC[0], vl); - _sum3 = vfmv_v_f_f32m1(pC[0], vl); - _sum4 = vfmv_v_f_f32m1(pC[0], vl); - _sum5 = vfmv_v_f_f32m1(pC[0], vl); - _sum6 = vfmv_v_f_f32m1(pC[0], vl); - _sum7 = vfmv_v_f_f32m1(pC[0], vl); - _sum8 = vfmv_v_f_f32m1(pC[0], vl); - _sum9 = vfmv_v_f_f32m1(pC[0], vl); - _suma = vfmv_v_f_f32m1(pC[0], vl); - _sumb = vfmv_v_f_f32m1(pC[0], vl); - } - if (broadcast_type_C == 1 || broadcast_type_C == 2) - { - _sum0 = vle32_v_f32m1(pC, vl); - _sum1 = _sum0; - _sum2 = _sum0; - _sum3 = _sum0; - _sum4 = _sum0; - _sum5 = _sum0; - _sum6 = _sum0; - _sum7 = _sum0; - _sum8 = _sum0; - _sum9 = _sum0; - _suma = _sum0; - _sumb = _sum0; - } - if (broadcast_type_C == 3) - { - _sum0 = vle32_v_f32m1(pC, vl); - _sum1 = vle32_v_f32m1(pC + 4, vl); - _sum2 = vle32_v_f32m1(pC + 8, vl); - _sum3 = vle32_v_f32m1(pC + 12, vl); - _sum4 = vle32_v_f32m1(pC + 16, vl); - _sum5 = vle32_v_f32m1(pC + 20, vl); - _sum6 = vle32_v_f32m1(pC + 24, vl); - _sum7 = vle32_v_f32m1(pC + 28, vl); - _sum8 = vle32_v_f32m1(pC + 32, vl); - _sum9 = vle32_v_f32m1(pC + 36, vl); - _suma = vle32_v_f32m1(pC + 40, vl); - _sumb = vle32_v_f32m1(pC + 44, vl); - pC += 48; - } - if (broadcast_type_C == 4) - { - _sum0 = vfmv_v_f_f32m1(pC[0], vl); - _sum1 = vfmv_v_f_f32m1(pC[1], vl); - _sum2 = vfmv_v_f_f32m1(pC[2], vl); - _sum3 = vfmv_v_f_f32m1(pC[3], vl); - _sum4 = vfmv_v_f_f32m1(pC[4], vl); - _sum5 = vfmv_v_f_f32m1(pC[5], vl); - _sum6 = vfmv_v_f_f32m1(pC[6], vl); - _sum7 = vfmv_v_f_f32m1(pC[7], vl); - _sum8 = vfmv_v_f_f32m1(pC[8], vl); - _sum9 = vfmv_v_f_f32m1(pC[9], vl); - _suma = vfmv_v_f_f32m1(pC[10], vl); - _sumb = vfmv_v_f_f32m1(pC[11], vl); - pC += 12; - } - } - } - else - { - _sum0 = vle32_v_f32m1(outptr, vl); - _sum1 = vle32_v_f32m1(outptr + 4 * 1, vl); - _sum2 = vle32_v_f32m1(outptr + 4 * 2, vl); - _sum3 = vle32_v_f32m1(outptr + 4 * 3, vl); - _sum4 = vle32_v_f32m1(outptr + 4 * 4, vl); - _sum5 = vle32_v_f32m1(outptr + 4 * 5, vl); - _sum6 = vle32_v_f32m1(outptr + 4 * 6, vl); - _sum7 = vle32_v_f32m1(outptr + 4 * 7, vl); - _sum8 = vle32_v_f32m1(outptr + 4 * 8, vl); - _sum9 = vle32_v_f32m1(outptr + 4 * 9, vl); - _suma = vle32_v_f32m1(outptr + 4 * 10, vl); - 
_sumb = vle32_v_f32m1(outptr + 4 * 11, vl); - } - - const float* pA = pAT; - int kk = 0; - for (; kk < max_kk; kk += 1) - { - vfloat32m1_t _pA = vle32_v_f32m1(pA, vl); - - _sum0 = vfmadd_vf_f32m1(_pA, pB[0], _sum0, vl); - _sum1 = vfmadd_vf_f32m1(_pA, pB[1], _sum1, vl); - _sum2 = vfmadd_vf_f32m1(_pA, pB[2], _sum2, vl); - _sum3 = vfmadd_vf_f32m1(_pA, pB[3], _sum3, vl); - _sum4 = vfmadd_vf_f32m1(_pA, pB[4], _sum4, vl); - _sum5 = vfmadd_vf_f32m1(_pA, pB[5], _sum5, vl); - _sum6 = vfmadd_vf_f32m1(_pA, pB[6], _sum6, vl); - _sum7 = vfmadd_vf_f32m1(_pA, pB[7], _sum7, vl); - _sum8 = vfmadd_vf_f32m1(_pA, pB[8], _sum8, vl); - _sum9 = vfmadd_vf_f32m1(_pA, pB[9], _sum9, vl); - _suma = vfmadd_vf_f32m1(_pA, pB[10], _suma, vl); - _sumb = vfmadd_vf_f32m1(_pA, pB[11], _sumb, vl); - - pA += 4; - pB += 12; - } - - if (k_end) - { - if (out_elempack == 4) - { - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + 4, _sum1, vl); - vse32_v_f32m1(outptr0 + 4 * 2, _sum2, vl); - vse32_v_f32m1(outptr0 + 4 * 3, _sum3, vl); - vse32_v_f32m1(outptr0 + 4 * 4, _sum4, vl); - vse32_v_f32m1(outptr0 + 4 * 5, _sum5, vl); - vse32_v_f32m1(outptr0 + 4 * 6, _sum6, vl); - vse32_v_f32m1(outptr0 + 4 * 7, _sum7, vl); - vse32_v_f32m1(outptr0 + 4 * 8, _sum8, vl); - vse32_v_f32m1(outptr0 + 4 * 9, _sum9, vl); - vse32_v_f32m1(outptr0 + 4 * 10, _suma, vl); - vse32_v_f32m1(outptr0 + 4 * 11, _sumb, vl); - outptr0 += 48; - } - if (out_elempack == 1) - { - transpose4x12_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, _sum8, _sum9, _suma, _sumb, vl); - - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + 4, _sum1, vl); - vse32_v_f32m1(outptr0 + 8, _sum2, vl); - vse32_v_f32m1(outptr0 + out_hstep, _sum3, vl); - vse32_v_f32m1(outptr0 + out_hstep + 4, _sum4, vl); - vse32_v_f32m1(outptr0 + out_hstep + 8, _sum5, vl); - vse32_v_f32m1(outptr0 + out_hstep * 2, _sum6, vl); - vse32_v_f32m1(outptr0 + out_hstep * 2 + 4, _sum7, vl); - vse32_v_f32m1(outptr0 + out_hstep * 2 + 8, _sum8, vl); - vse32_v_f32m1(outptr0 + out_hstep * 3, _sum9, vl); - vse32_v_f32m1(outptr0 + out_hstep * 3 + 4, _suma, vl); - vse32_v_f32m1(outptr0 + out_hstep * 3 + 8, _sumb, vl); - outptr0 += 12; - } - } - else - { - vse32_v_f32m1(outptr, _sum0, vl); - vse32_v_f32m1(outptr + 4, _sum1, vl); - vse32_v_f32m1(outptr + 4 * 2, _sum2, vl); - vse32_v_f32m1(outptr + 4 * 3, _sum3, vl); - vse32_v_f32m1(outptr + 4 * 4, _sum4, vl); - vse32_v_f32m1(outptr + 4 * 5, _sum5, vl); - vse32_v_f32m1(outptr + 4 * 6, _sum6, vl); - vse32_v_f32m1(outptr + 4 * 7, _sum7, vl); - vse32_v_f32m1(outptr + 4 * 8, _sum8, vl); - vse32_v_f32m1(outptr + 4 * 9, _sum9, vl); - vse32_v_f32m1(outptr + 4 * 10, _suma, vl); - vse32_v_f32m1(outptr + 4 * 11, _sumb, vl); - } - - outptr += 48; - } - for (; jj + 7 < max_jj; jj += 8) - { - vfloat32m1_t _sum0; - vfloat32m1_t _sum1; - vfloat32m1_t _sum2; - vfloat32m1_t _sum3; - vfloat32m1_t _sum4; - vfloat32m1_t _sum5; - vfloat32m1_t _sum6; - vfloat32m1_t _sum7; - - if (k == 0) - { - _sum0 = vfmv_v_f_f32m1(0.f, vl); - _sum1 = vfmv_v_f_f32m1(0.f, vl); - _sum2 = vfmv_v_f_f32m1(0.f, vl); - _sum3 = vfmv_v_f_f32m1(0.f, vl); - _sum4 = vfmv_v_f_f32m1(0.f, vl); - _sum5 = vfmv_v_f_f32m1(0.f, vl); - _sum6 = vfmv_v_f_f32m1(0.f, vl); - _sum7 = vfmv_v_f_f32m1(0.f, vl); - - if (pC) - { - if (broadcast_type_C == 0) - { - _sum0 = vfmv_v_f_f32m1(pC[0], vl); - _sum1 = vfmv_v_f_f32m1(pC[0], vl); - _sum2 = vfmv_v_f_f32m1(pC[0], vl); - _sum3 = vfmv_v_f_f32m1(pC[0], vl); - _sum4 = vfmv_v_f_f32m1(pC[0], vl); - _sum5 = vfmv_v_f_f32m1(pC[0], vl); - _sum6 = vfmv_v_f_f32m1(pC[0], vl); - 
_sum7 = vfmv_v_f_f32m1(pC[0], vl); - } - if (broadcast_type_C == 1 || broadcast_type_C == 2) - { - _sum0 = vle32_v_f32m1(pC, vl); - _sum1 = _sum0; - _sum2 = _sum0; - _sum3 = _sum0; - _sum4 = _sum0; - _sum5 = _sum0; - _sum6 = _sum0; - _sum7 = _sum0; - } - if (broadcast_type_C == 3) - { - _sum0 = vle32_v_f32m1(pC, vl); - _sum1 = vle32_v_f32m1(pC + 4, vl); - _sum2 = vle32_v_f32m1(pC + 8, vl); - _sum3 = vle32_v_f32m1(pC + 12, vl); - _sum4 = vle32_v_f32m1(pC + 16, vl); - _sum5 = vle32_v_f32m1(pC + 20, vl); - _sum6 = vle32_v_f32m1(pC + 24, vl); - _sum7 = vle32_v_f32m1(pC + 28, vl); - pC += 32; - } - if (broadcast_type_C == 4) - { - _sum0 = vfmv_v_f_f32m1(pC[0], vl); - _sum1 = vfmv_v_f_f32m1(pC[1], vl); - _sum2 = vfmv_v_f_f32m1(pC[2], vl); - _sum3 = vfmv_v_f_f32m1(pC[3], vl); - _sum4 = vfmv_v_f_f32m1(pC[4], vl); - _sum5 = vfmv_v_f_f32m1(pC[5], vl); - _sum6 = vfmv_v_f_f32m1(pC[6], vl); - _sum7 = vfmv_v_f_f32m1(pC[7], vl); - pC += 8; - } - } - } - else - { - _sum0 = vle32_v_f32m1(outptr, vl); - _sum1 = vle32_v_f32m1(outptr + 4 * 1, vl); - _sum2 = vle32_v_f32m1(outptr + 4 * 2, vl); - _sum3 = vle32_v_f32m1(outptr + 4 * 3, vl); - _sum4 = vle32_v_f32m1(outptr + 4 * 4, vl); - _sum5 = vle32_v_f32m1(outptr + 4 * 5, vl); - _sum6 = vle32_v_f32m1(outptr + 4 * 6, vl); - _sum7 = vle32_v_f32m1(outptr + 4 * 7, vl); + pC = (const float*)CT_tile + i + ii; } - - const float* pA = pAT; - int kk = 0; - for (; kk < max_kk; kk += 1) + if (broadcast_type_C == 4) { - vfloat32m1_t _pA = vle32_v_f32m1(pA, vl); - - _sum0 = vfmadd_vf_f32m1(_pA, pB[0], _sum0, vl); - _sum1 = vfmadd_vf_f32m1(_pA, pB[1], _sum1, vl); - _sum2 = vfmadd_vf_f32m1(_pA, pB[2], _sum2, vl); - _sum3 = vfmadd_vf_f32m1(_pA, pB[3], _sum3, vl); - _sum4 = vfmadd_vf_f32m1(_pA, pB[4], _sum4, vl); - _sum5 = vfmadd_vf_f32m1(_pA, pB[5], _sum5, vl); - _sum6 = vfmadd_vf_f32m1(_pA, pB[6], _sum6, vl); - _sum7 = vfmadd_vf_f32m1(_pA, pB[7], _sum7, vl); - - pA += 4; - pB += 8; + pC = (const float*)CT_tile + j; } + } - if (k_end) + int jj = 0; + for (; jj + (packn - 1) < max_jj; jj += packn) + { + if (packn == 8) { - if (out_elempack == 4) + vfloat32m1_t _sum0; + vfloat32m1_t _sum1; + vfloat32m1_t _sum2; + vfloat32m1_t _sum3; + vfloat32m1_t _sum4; + vfloat32m1_t _sum5; + vfloat32m1_t _sum6; + vfloat32m1_t _sum7; + + if (k == 0) { - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + 4, _sum1, vl); - vse32_v_f32m1(outptr0 + 4 * 2, _sum2, vl); - vse32_v_f32m1(outptr0 + 4 * 3, _sum3, vl); - vse32_v_f32m1(outptr0 + 4 * 4, _sum4, vl); - vse32_v_f32m1(outptr0 + 4 * 5, _sum5, vl); - vse32_v_f32m1(outptr0 + 4 * 6, _sum6, vl); - vse32_v_f32m1(outptr0 + 4 * 7, _sum7, vl); - outptr0 += 32; + _sum0 = __riscv_vfmv_v_f_f32m1(0.f, vl); + _sum1 = __riscv_vfmv_v_f_f32m1(0.f, vl); + _sum2 = __riscv_vfmv_v_f_f32m1(0.f, vl); + _sum3 = __riscv_vfmv_v_f_f32m1(0.f, vl); + _sum4 = __riscv_vfmv_v_f_f32m1(0.f, vl); + _sum5 = __riscv_vfmv_v_f_f32m1(0.f, vl); + _sum6 = __riscv_vfmv_v_f_f32m1(0.f, vl); + _sum7 = __riscv_vfmv_v_f_f32m1(0.f, vl); + + if (pC) + { + if (broadcast_type_C == 0) + { + _sum0 = __riscv_vfmv_v_f_f32m1(pC[0], vl); + _sum1 = _sum0; + _sum2 = _sum0; + _sum3 = _sum0; + _sum4 = _sum0; + _sum5 = _sum0; + _sum6 = _sum0; + _sum7 = _sum0; + } + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + _sum0 = __riscv_vle32_v_f32m1(pC, vl); + _sum1 = _sum0; + _sum2 = _sum0; + _sum3 = _sum0; + _sum4 = _sum0; + _sum5 = _sum0; + _sum6 = _sum0; + _sum7 = _sum0; + } + if (broadcast_type_C == 3) + { + _sum0 = __riscv_vle32_v_f32m1(pC, vl); + _sum1 = __riscv_vle32_v_f32m1(pC + packn, 
vl); + _sum2 = __riscv_vle32_v_f32m1(pC + packn * 2, vl); + _sum3 = __riscv_vle32_v_f32m1(pC + packn * 3, vl); + _sum4 = __riscv_vle32_v_f32m1(pC + packn * 4, vl); + _sum5 = __riscv_vle32_v_f32m1(pC + packn * 5, vl); + _sum6 = __riscv_vle32_v_f32m1(pC + packn * 6, vl); + _sum7 = __riscv_vle32_v_f32m1(pC + packn * 7, vl); + pC += packn * 8; + } + if (broadcast_type_C == 4) + { + _sum0 = __riscv_vfmv_v_f_f32m1(pC[0], vl); + _sum1 = __riscv_vfmv_v_f_f32m1(pC[1], vl); + _sum2 = __riscv_vfmv_v_f_f32m1(pC[2], vl); + _sum3 = __riscv_vfmv_v_f_f32m1(pC[3], vl); + _sum4 = __riscv_vfmv_v_f_f32m1(pC[4], vl); + _sum5 = __riscv_vfmv_v_f_f32m1(pC[5], vl); + _sum6 = __riscv_vfmv_v_f_f32m1(pC[6], vl); + _sum7 = __riscv_vfmv_v_f_f32m1(pC[7], vl); + pC += 8; + } + } } - if (out_elempack == 1) + else { - transpose4x8_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, vl); - - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + 4, _sum1, vl); - vse32_v_f32m1(outptr0 + out_hstep, _sum2, vl); - vse32_v_f32m1(outptr0 + out_hstep + 4, _sum3, vl); - vse32_v_f32m1(outptr0 + out_hstep * 2, _sum4, vl); - vse32_v_f32m1(outptr0 + out_hstep * 2 + 4, _sum5, vl); - vse32_v_f32m1(outptr0 + out_hstep * 3, _sum6, vl); - vse32_v_f32m1(outptr0 + out_hstep * 3 + 4, _sum7, vl); - outptr0 += 8; + _sum0 = __riscv_vle32_v_f32m1(outptr, vl); + _sum1 = __riscv_vle32_v_f32m1(outptr + packn, vl); + _sum2 = __riscv_vle32_v_f32m1(outptr + packn * 2, vl); + _sum3 = __riscv_vle32_v_f32m1(outptr + packn * 3, vl); + _sum4 = __riscv_vle32_v_f32m1(outptr + packn * 4, vl); + _sum5 = __riscv_vle32_v_f32m1(outptr + packn * 5, vl); + _sum6 = __riscv_vle32_v_f32m1(outptr + packn * 6, vl); + _sum7 = __riscv_vle32_v_f32m1(outptr + packn * 7, vl); } - } - else - { - vse32_v_f32m1(outptr, _sum0, vl); - vse32_v_f32m1(outptr + 4, _sum1, vl); - vse32_v_f32m1(outptr + 4 * 2, _sum2, vl); - vse32_v_f32m1(outptr + 4 * 3, _sum3, vl); - vse32_v_f32m1(outptr + 4 * 4, _sum4, vl); - vse32_v_f32m1(outptr + 4 * 5, _sum5, vl); - vse32_v_f32m1(outptr + 4 * 6, _sum6, vl); - vse32_v_f32m1(outptr + 4 * 7, _sum7, vl); - } - - outptr += 32; - } - for (; jj + 3 < max_jj; jj += 4) - { - vfloat32m1_t _sum0; - vfloat32m1_t _sum1; - vfloat32m1_t _sum2; - vfloat32m1_t _sum3; - if (k == 0) - { - _sum0 = vfmv_v_f_f32m1(0.f, vl); - _sum1 = vfmv_v_f_f32m1(0.f, vl); - _sum2 = vfmv_v_f_f32m1(0.f, vl); - _sum3 = vfmv_v_f_f32m1(0.f, vl); + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + vfloat32m1_t _pA = __riscv_vle32_v_f32m1(pA, vl); + _sum0 = __riscv_vfmadd_vf_f32m1(_pA, pB[0], _sum0, vl); + _sum1 = __riscv_vfmadd_vf_f32m1(_pA, pB[1], _sum1, vl); + _sum2 = __riscv_vfmadd_vf_f32m1(_pA, pB[2], _sum2, vl); + _sum3 = __riscv_vfmadd_vf_f32m1(_pA, pB[3], _sum3, vl); + _sum4 = __riscv_vfmadd_vf_f32m1(_pA, pB[4], _sum4, vl); + _sum5 = __riscv_vfmadd_vf_f32m1(_pA, pB[5], _sum5, vl); + _sum6 = __riscv_vfmadd_vf_f32m1(_pA, pB[6], _sum6, vl); + _sum7 = __riscv_vfmadd_vf_f32m1(_pA, pB[7], _sum7, vl); + pA += packn; + pB += 8; + } - if (pC) + if (k_end) { - if (broadcast_type_C == 0) - { - _sum0 = vfmv_v_f_f32m1(pC[0], vl); - _sum1 = vfmv_v_f_f32m1(pC[0], vl); - _sum2 = vfmv_v_f_f32m1(pC[0], vl); - _sum3 = vfmv_v_f_f32m1(pC[0], vl); - } - if (broadcast_type_C == 1 || broadcast_type_C == 2) - { - _sum0 = vle32_v_f32m1(pC, vl); - _sum1 = _sum0; - _sum2 = _sum0; - _sum3 = _sum0; - } - if (broadcast_type_C == 3) + if (out_elempack == packn) { - _sum0 = vle32_v_f32m1(pC, vl); - _sum1 = vle32_v_f32m1(pC + 4, vl); - _sum2 = vle32_v_f32m1(pC + 8, vl); - _sum3 = 
vle32_v_f32m1(pC + 12, vl); - pC += 16; + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0 + packn, _sum1, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 4, _sum4, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 5, _sum5, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 6, _sum6, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 7, _sum7, vl); + outptr0 += packn * 8; } - if (broadcast_type_C == 4) + if (out_elempack == 1) { - _sum0 = vfmv_v_f_f32m1(pC[0], vl); - _sum1 = vfmv_v_f_f32m1(pC[1], vl); - _sum2 = vfmv_v_f_f32m1(pC[2], vl); - _sum3 = vfmv_v_f_f32m1(pC[3], vl); - pC += 4; + vfloat32m1x8_t _sum = __riscv_vcreate_v_f32m1x8(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7); + __riscv_vssseg8e32_v_f32m1x8(outptr0, out_hstep * sizeof(float), _sum, vl); + outptr0 += 8; } } - } - else - { - _sum0 = vle32_v_f32m1(outptr, vl); - _sum1 = vle32_v_f32m1(outptr + 4 * 1, vl); - _sum2 = vle32_v_f32m1(outptr + 4 * 2, vl); - _sum3 = vle32_v_f32m1(outptr + 4 * 3, vl); - } + else + { + __riscv_vse32_v_f32m1(outptr, _sum0, vl); + __riscv_vse32_v_f32m1(outptr + packn, _sum1, vl); + __riscv_vse32_v_f32m1(outptr + packn * 2, _sum2, vl); + __riscv_vse32_v_f32m1(outptr + packn * 3, _sum3, vl); + __riscv_vse32_v_f32m1(outptr + packn * 4, _sum4, vl); + __riscv_vse32_v_f32m1(outptr + packn * 5, _sum5, vl); + __riscv_vse32_v_f32m1(outptr + packn * 6, _sum6, vl); + __riscv_vse32_v_f32m1(outptr + packn * 7, _sum7, vl); + } - const float* pA = pAT; - int kk = 0; - for (; kk < max_kk; kk += 1) - { - vfloat32m1_t _pA = vle32_v_f32m1(pA, vl); - - _sum0 = vfmadd_vf_f32m1(_pA, pB[0], _sum0, vl); - _sum1 = vfmadd_vf_f32m1(_pA, pB[1], _sum1, vl); - _sum2 = vfmadd_vf_f32m1(_pA, pB[2], _sum2, vl); - _sum3 = vfmadd_vf_f32m1(_pA, pB[3], _sum3, vl); - pA += 4; - pB += 4; + outptr += packn * 8; } - - if (k_end) + else if (packn == 4) { - if (out_elempack == 4) + vfloat32m1_t _sum0; + vfloat32m1_t _sum1; + vfloat32m1_t _sum2; + vfloat32m1_t _sum3; + + if (k == 0) { - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + 4, _sum1, vl); - vse32_v_f32m1(outptr0 + 4 * 2, _sum2, vl); - vse32_v_f32m1(outptr0 + 4 * 3, _sum3, vl); - outptr0 += 16; + _sum0 = __riscv_vfmv_v_f_f32m1(0.f, vl); + _sum1 = __riscv_vfmv_v_f_f32m1(0.f, vl); + _sum2 = __riscv_vfmv_v_f_f32m1(0.f, vl); + _sum3 = __riscv_vfmv_v_f_f32m1(0.f, vl); + + if (pC) + { + if (broadcast_type_C == 0) + { + _sum0 = __riscv_vfmv_v_f_f32m1(pC[0], vl); + _sum1 = _sum0; + _sum2 = _sum0; + _sum3 = _sum0; + } + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + _sum0 = __riscv_vle32_v_f32m1(pC, vl); + _sum1 = _sum0; + _sum2 = _sum0; + _sum3 = _sum0; + } + if (broadcast_type_C == 3) + { + _sum0 = __riscv_vle32_v_f32m1(pC, vl); + _sum1 = __riscv_vle32_v_f32m1(pC + packn, vl); + _sum2 = __riscv_vle32_v_f32m1(pC + packn * 2, vl); + _sum3 = __riscv_vle32_v_f32m1(pC + packn * 3, vl); + pC += packn * 4; + } + if (broadcast_type_C == 4) + { + _sum0 = __riscv_vfmv_v_f_f32m1(pC[0], vl); + _sum1 = __riscv_vfmv_v_f_f32m1(pC[1], vl); + _sum2 = __riscv_vfmv_v_f_f32m1(pC[2], vl); + _sum3 = __riscv_vfmv_v_f_f32m1(pC[3], vl); + pC += 4; + } + } } - if (out_elempack == 1) + else { - transpose4x4_ps(_sum0, _sum1, _sum2, _sum3, vl); + _sum0 = __riscv_vle32_v_f32m1(outptr, vl); + _sum1 = __riscv_vle32_v_f32m1(outptr + packn, vl); + _sum2 = __riscv_vle32_v_f32m1(outptr + packn * 2, vl); + _sum3 = __riscv_vle32_v_f32m1(outptr + packn * 3, vl); + } + + 
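// Editor's note: the rewritten kernels above drop the in-register transpose
// (transpose4x12_ps / transpose4x8_ps) and instead write the out_elempack == 1
// result with one strided segment store. A minimal sketch of that store,
// assuming vl == packn == 4 (VLEN 128); the helper name is illustrative, the
// intrinsics are the ones used in the diff.
#include <riscv_vector.h>

static inline void store_tile_4x4(float* outptr0, ptrdiff_t out_hstep,
                                  vfloat32m1_t s0, vfloat32m1_t s1,
                                  vfloat32m1_t s2, vfloat32m1_t s3, size_t vl)
{
    // each accumulator holds one output column across packn rows;
    // segment i lands at outptr0 + i * out_hstep, so destination row i
    // receives element i of every accumulator - a 4x4 transpose fused into the store
    vfloat32m1x4_t tile = __riscv_vcreate_v_f32m1x4(s0, s1, s2, s3);
    __riscv_vssseg4e32_v_f32m1x4(outptr0, out_hstep * sizeof(float), tile, vl); // stride in bytes
}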
const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + vfloat32m1_t _pA = __riscv_vle32_v_f32m1(pA, vl); + _sum0 = __riscv_vfmadd_vf_f32m1(_pA, pB[0], _sum0, vl); + _sum1 = __riscv_vfmadd_vf_f32m1(_pA, pB[1], _sum1, vl); + _sum2 = __riscv_vfmadd_vf_f32m1(_pA, pB[2], _sum2, vl); + _sum3 = __riscv_vfmadd_vf_f32m1(_pA, pB[3], _sum3, vl); + pA += packn; + pB += 4; + } - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + out_hstep * 1, _sum1, vl); - vse32_v_f32m1(outptr0 + out_hstep * 2, _sum2, vl); - vse32_v_f32m1(outptr0 + out_hstep * 3, _sum3, vl); - outptr0 += 4; + if (k_end) + { + if (out_elempack == packn) + { + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0 + packn, _sum1, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl); + __riscv_vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl); + outptr0 += packn * 4; + } + if (out_elempack == 1) + { + vfloat32m1x4_t _sum = __riscv_vcreate_v_f32m1x4(_sum0, _sum1, _sum2, _sum3); + __riscv_vssseg4e32_v_f32m1x4(outptr0, out_hstep * sizeof(float), _sum, vl); + outptr0 += 4; + } + } + else + { + __riscv_vse32_v_f32m1(outptr, _sum0, vl); + __riscv_vse32_v_f32m1(outptr + packn, _sum1, vl); + __riscv_vse32_v_f32m1(outptr + packn * 2, _sum2, vl); + __riscv_vse32_v_f32m1(outptr + packn * 3, _sum3, vl); } + + outptr += packn * 4; } else { - vse32_v_f32m1(outptr, _sum0, vl); - vse32_v_f32m1(outptr + 4, _sum1, vl); - vse32_v_f32m1(outptr + 4 * 2, _sum2, vl); - vse32_v_f32m1(outptr + 4 * 3, _sum3, vl); + NCNN_LOGE("unsupported vector length"); } - - outptr += 16; } for (; jj + 1 < max_jj; jj += 2) { @@ -2587,87 +839,76 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons if (k == 0) { - _sum0 = vfmv_v_f_f32m1(0.f, vl); - _sum1 = vfmv_v_f_f32m1(0.f, vl); + _sum0 = __riscv_vfmv_v_f_f32m1(0.f, vl); + _sum1 = __riscv_vfmv_v_f_f32m1(0.f, vl); if (pC) { if (broadcast_type_C == 0) { - _sum0 = vfmv_v_f_f32m1(pC[0], vl); - _sum1 = vfmv_v_f_f32m1(pC[0], vl); + _sum0 = __riscv_vfmv_v_f_f32m1(pC[0], vl); + _sum1 = __riscv_vfmv_v_f_f32m1(pC[0], vl); } if (broadcast_type_C == 1 || broadcast_type_C == 2) { - _sum0 = vle32_v_f32m1(pC, vl); + _sum0 = __riscv_vle32_v_f32m1(pC, vl); _sum1 = _sum0; } if (broadcast_type_C == 3) { - _sum0 = vle32_v_f32m1(pC, vl); - _sum1 = vle32_v_f32m1(pC + 4, vl); - pC += 8; + _sum0 = __riscv_vle32_v_f32m1(pC, vl); + _sum1 = __riscv_vle32_v_f32m1(pC + packn, vl); + pC += packn * 2; } if (broadcast_type_C == 4) { - _sum0 = vfmv_v_f_f32m1(pC[0], vl); - _sum1 = vfmv_v_f_f32m1(pC[1], vl); + _sum0 = __riscv_vfmv_v_f_f32m1(pC[0], vl); + _sum1 = __riscv_vfmv_v_f_f32m1(pC[1], vl); pC += 2; } } } else { - _sum0 = vle32_v_f32m1(outptr, vl); - _sum1 = vle32_v_f32m1(outptr + 4, vl); + _sum0 = __riscv_vle32_v_f32m1(outptr, vl); + _sum1 = __riscv_vle32_v_f32m1(outptr + packn, vl); } const float* pA = pAT; int kk = 0; for (; kk < max_kk; kk += 1) { - vfloat32m1_t _pA = vle32_v_f32m1(pA, vl); + vfloat32m1_t _pA = __riscv_vle32_v_f32m1(pA, vl); - _sum0 = vfmadd_vf_f32m1(_pA, pB[0], _sum0, vl); - _sum1 = vfmadd_vf_f32m1(_pA, pB[1], _sum1, vl); + _sum0 = __riscv_vfmadd_vf_f32m1(_pA, pB[0], _sum0, vl); + _sum1 = __riscv_vfmadd_vf_f32m1(_pA, pB[1], _sum1, vl); - pA += 4; + pA += packn; pB += 2; } if (k_end) { - if (out_elempack == 4) + if (out_elempack == packn) { - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + 4, _sum1, vl); - outptr0 += 8; + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0 + packn, _sum1, vl); + outptr0 += packn * 2; 
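// Editor's note: all of the jj kernels above share the same per-k accumulation,
// now written with the renamed __riscv_* intrinsics and a packn-wide register
// instead of a hard-coded width of 4. A sketch for a single output column,
// assuming vl == packn; the function name is illustrative, not ncnn's.
#include <riscv_vector.h>

static inline vfloat32m1_t accumulate_column(const float* pA, const float* pB,
                                             int max_kk, size_t vl)
{
    vfloat32m1_t sum = __riscv_vfmv_v_f_f32m1(0.f, vl);
    for (int kk = 0; kk < max_kk; kk++)
    {
        vfloat32m1_t a = __riscv_vle32_v_f32m1(pA, vl);    // packn packed rows of A for this k
        sum = __riscv_vfmadd_vf_f32m1(a, pB[0], sum, vl);  // sum += a * B[k][j]
        pA += vl; // packed A advances by packn per k
        pB += 1;  // one scalar of B per k for a single column; the wider kernels step by 8/4/2
    }
    return sum;
}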
} if (out_elempack == 1) { - float sum0[4]; - float sum1[4]; - vse32_v_f32m1(sum0, _sum0, vl); - vse32_v_f32m1(sum1, _sum1, vl); - - outptr0[0] = sum0[0]; - outptr0[out_hstep] = sum0[1]; - outptr0[out_hstep * 2] = sum0[2]; - outptr0[out_hstep * 3] = sum0[3]; - outptr0[1] = sum1[0]; - outptr0[out_hstep + 1] = sum1[1]; - outptr0[out_hstep * 2 + 1] = sum1[2]; - outptr0[out_hstep * 3 + 1] = sum1[3]; + vfloat32m1x2_t _sum = __riscv_vcreate_v_f32m1x2(_sum0, _sum1); + __riscv_vssseg2e32_v_f32m1x2(outptr0, out_hstep * sizeof(float), _sum, vl); outptr0 += 2; } } else { - vse32_v_f32m1(outptr, _sum0, vl); - vse32_v_f32m1(outptr + 4, _sum1, vl); + __riscv_vse32_v_f32m1(outptr, _sum0, vl); + __riscv_vse32_v_f32m1(outptr + packn, _sum1, vl); } - outptr += 8; + outptr += packn * 2; } for (; jj < max_jj; jj += 1) { @@ -2675,76 +916,70 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons if (k == 0) { - _sum0 = vfmv_v_f_f32m1(0.f, vl); + _sum0 = __riscv_vfmv_v_f_f32m1(0.f, vl); if (pC) { if (broadcast_type_C == 0) { - _sum0 = vfmv_v_f_f32m1(pC[0], vl); + _sum0 = __riscv_vfmv_v_f_f32m1(pC[0], vl); } if (broadcast_type_C == 1 || broadcast_type_C == 2) { - _sum0 = vle32_v_f32m1(pC, vl); + _sum0 = __riscv_vle32_v_f32m1(pC, vl); } if (broadcast_type_C == 3) { - _sum0 = vle32_v_f32m1(pC, vl); - pC += 4; + _sum0 = __riscv_vle32_v_f32m1(pC, vl); + pC += packn; } if (broadcast_type_C == 4) { - _sum0 = vfmv_v_f_f32m1(pC[0], vl); + _sum0 = __riscv_vfmv_v_f_f32m1(pC[0], vl); pC += 1; } } } else { - _sum0 = vle32_v_f32m1(outptr, vl); + _sum0 = __riscv_vle32_v_f32m1(outptr, vl); } const float* pA = pAT; int kk = 0; for (; kk < max_kk; kk += 1) { - vfloat32m1_t _pA = vle32_v_f32m1(pA, vl); - vfloat32m1_t _pB = vfmv_v_f_f32m1(pB[0], vl); + vfloat32m1_t _pA = __riscv_vle32_v_f32m1(pA, vl); + vfloat32m1_t _pB = __riscv_vfmv_v_f_f32m1(pB[0], vl); - _sum0 = vfmadd_vv_f32m1(_pA, _pB, _sum0, vl); + _sum0 = __riscv_vfmadd_vv_f32m1(_pA, _pB, _sum0, vl); - pA += 4; + pA += packn; pB += 1; } if (k_end) { - if (out_elempack == 4) + if (out_elempack == packn) { - vse32_v_f32m1(outptr0, _sum0, vl); - outptr0 += 4; + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + outptr0 += packn; } if (out_elempack == 1) { - float sum0[4]; - vse32_v_f32m1(sum0, _sum0, vl); - - outptr0[0] = sum0[0]; - outptr0[out_hstep] = sum0[1]; - outptr0[out_hstep * 2] = sum0[2]; - outptr0[out_hstep * 3] = sum0[3]; + __riscv_vsse32_v_f32m1(outptr0, out_hstep * sizeof(float), _sum0, vl); outptr0++; } } else { - vse32_v_f32m1(outptr, _sum0, vl); + __riscv_vse32_v_f32m1(outptr, _sum0, vl); } - outptr += 4; + outptr += packn; } - pAT += max_kk * 4; + pAT += max_kk * packn; } #endif // __riscv_vector for (; ii + 1 < max_ii; ii += 2) @@ -2767,269 +1002,78 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons int jj = 0; #if __riscv_vector - for (; jj + 11 < max_jj; jj += 12) - { - vfloat32m1_t _sum00; - vfloat32m1_t _sum01; - vfloat32m1_t _sum02; - vfloat32m1_t _sum10; - vfloat32m1_t _sum11; - vfloat32m1_t _sum12; - - if (k == 0) - { - _sum00 = vfmv_v_f_f32m1(0.f, vl); - _sum01 = vfmv_v_f_f32m1(0.f, vl); - _sum02 = vfmv_v_f_f32m1(0.f, vl); - _sum10 = vfmv_v_f_f32m1(0.f, vl); - _sum11 = vfmv_v_f_f32m1(0.f, vl); - _sum12 = vfmv_v_f_f32m1(0.f, vl); - - if (pC) - { - if (broadcast_type_C == 0) - { - _sum00 = vfmv_v_f_f32m1(pC[0], vl); - _sum01 = vfmv_v_f_f32m1(pC[0], vl); - _sum02 = vfmv_v_f_f32m1(pC[0], vl); - _sum10 = vfmv_v_f_f32m1(pC[0], vl); - _sum11 = vfmv_v_f_f32m1(pC[0], vl); - _sum12 = vfmv_v_f_f32m1(pC[0], 
vl); - } - if (broadcast_type_C == 1 || broadcast_type_C == 2) - { - _sum00 = vfmv_v_f_f32m1(pC[0], vl); - _sum01 = vfmv_v_f_f32m1(pC[0], vl); - _sum02 = vfmv_v_f_f32m1(pC[0], vl); - _sum10 = vfmv_v_f_f32m1(pC[1], vl); - _sum11 = vfmv_v_f_f32m1(pC[1], vl); - _sum12 = vfmv_v_f_f32m1(pC[1], vl); - } - if (broadcast_type_C == 3) - { - vlseg2e32_v_f32m1(&_sum00, &_sum10, pC, vl); - vlseg2e32_v_f32m1(&_sum01, &_sum11, pC + 8, vl); - vlseg2e32_v_f32m1(&_sum02, &_sum12, pC + 16, vl); - pC += 24; - } - if (broadcast_type_C == 4) - { - _sum00 = vle32_v_f32m1(pC, vl); - _sum01 = vle32_v_f32m1(pC + 4, vl); - _sum02 = vle32_v_f32m1(pC + 8, vl); - _sum10 = _sum00; - _sum11 = _sum01; - _sum12 = _sum02; - pC += 12; - } - } - } - else - { - vlseg2e32_v_f32m1(&_sum00, &_sum10, outptr, vl); - vlseg2e32_v_f32m1(&_sum01, &_sum11, outptr + 8, vl); - vlseg2e32_v_f32m1(&_sum02, &_sum12, outptr + 16, vl); - } - - const float* pA = pAT; - int kk = 0; - for (; kk < max_kk; kk += 1) - { - vfloat32m1_t _pB0 = vle32_v_f32m1(pB, vl); - vfloat32m1_t _pB1 = vle32_v_f32m1(pB + 4, vl); - vfloat32m1_t _pB2 = vle32_v_f32m1(pB + 8, vl); - - _sum00 = vfmadd_vf_f32m1(_pB0, pA[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pB1, pA[0], _sum01, vl); - _sum02 = vfmadd_vf_f32m1(_pB2, pA[0], _sum02, vl); - _sum10 = vfmadd_vf_f32m1(_pB0, pA[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pB1, pA[1], _sum11, vl); - _sum12 = vfmadd_vf_f32m1(_pB2, pA[1], _sum12, vl); - - pA += 2; - pB += 12; - } - - if (k_end) - { - // if (out_elempack == 1) - { - vse32_v_f32m1(outptr0, _sum00, vl); - vse32_v_f32m1(outptr0 + 4, _sum01, vl); - vse32_v_f32m1(outptr0 + 8, _sum02, vl); - vse32_v_f32m1(outptr0 + out_hstep, _sum10, vl); - vse32_v_f32m1(outptr0 + out_hstep + 4, _sum11, vl); - vse32_v_f32m1(outptr0 + out_hstep + 8, _sum12, vl); - outptr0 += 12; - } - } - else - { - vsseg2e32_v_f32m1(outptr, _sum00, _sum10, vl); - vsseg2e32_v_f32m1(outptr + 8, _sum01, _sum11, vl); - vsseg2e32_v_f32m1(outptr + 16, _sum02, _sum12, vl); - } - - outptr += 24; - } - for (; jj + 7 < max_jj; jj += 8) - { - vfloat32m1_t _sum00; - vfloat32m1_t _sum01; - vfloat32m1_t _sum10; - vfloat32m1_t _sum11; - - if (k == 0) - { - _sum00 = vfmv_v_f_f32m1(0.f, vl); - _sum01 = vfmv_v_f_f32m1(0.f, vl); - _sum10 = vfmv_v_f_f32m1(0.f, vl); - _sum11 = vfmv_v_f_f32m1(0.f, vl); - - if (pC) - { - if (broadcast_type_C == 0) - { - _sum00 = vfmv_v_f_f32m1(pC[0], vl); - _sum01 = vfmv_v_f_f32m1(pC[0], vl); - _sum10 = vfmv_v_f_f32m1(pC[0], vl); - _sum11 = vfmv_v_f_f32m1(pC[0], vl); - } - if (broadcast_type_C == 1 || broadcast_type_C == 2) - { - _sum00 = vfmv_v_f_f32m1(pC[0], vl); - _sum01 = vfmv_v_f_f32m1(pC[0], vl); - _sum10 = vfmv_v_f_f32m1(pC[1], vl); - _sum11 = vfmv_v_f_f32m1(pC[1], vl); - } - if (broadcast_type_C == 3) - { - vlseg2e32_v_f32m1(&_sum00, &_sum10, pC, vl); - vlseg2e32_v_f32m1(&_sum01, &_sum11, pC + 8, vl); - pC += 16; - } - if (broadcast_type_C == 4) - { - _sum00 = vle32_v_f32m1(pC, vl); - _sum01 = vle32_v_f32m1(pC + 4, vl); - _sum10 = _sum00; - _sum11 = _sum01; - pC += 8; - } - } - } - else - { - vlseg2e32_v_f32m1(&_sum00, &_sum10, outptr, vl); - vlseg2e32_v_f32m1(&_sum01, &_sum11, outptr + 8, vl); - } - - const float* pA = pAT; - int kk = 0; - for (; kk < max_kk; kk += 1) - { - vfloat32m1_t _pB0 = vle32_v_f32m1(pB, vl); - vfloat32m1_t _pB1 = vle32_v_f32m1(pB + 4, vl); - - _sum00 = vfmadd_vf_f32m1(_pB0, pA[0], _sum00, vl); - _sum01 = vfmadd_vf_f32m1(_pB1, pA[0], _sum01, vl); - _sum10 = vfmadd_vf_f32m1(_pB0, pA[1], _sum10, vl); - _sum11 = vfmadd_vf_f32m1(_pB1, pA[1], _sum11, vl); - pA 
+= 2; - pB += 8; - } - - if (k_end) - { - // if (out_elempack == 1) - { - vse32_v_f32m1(outptr0, _sum00, vl); - vse32_v_f32m1(outptr0 + 4, _sum01, vl); - vse32_v_f32m1(outptr0 + out_hstep, _sum10, vl); - vse32_v_f32m1(outptr0 + out_hstep + 4, _sum11, vl); - outptr0 += 8; - } - } - else - { - vsseg2e32_v_f32m1(outptr, _sum00, _sum10, vl); - vsseg2e32_v_f32m1(outptr + 8, _sum01, _sum11, vl); - } - - outptr += 16; - } - for (; jj + 3 < max_jj; jj += 4) + for (; jj + (packn - 1) < max_jj; jj += packn) { vfloat32m1_t _sum0; vfloat32m1_t _sum1; if (k == 0) { - _sum0 = vfmv_v_f_f32m1(0.f, vl); - _sum1 = vfmv_v_f_f32m1(0.f, vl); + _sum0 = __riscv_vfmv_v_f_f32m1(0.f, vl); + _sum1 = __riscv_vfmv_v_f_f32m1(0.f, vl); if (pC) { if (broadcast_type_C == 0) { - _sum0 = vfmv_v_f_f32m1(pC[0], vl); - _sum1 = vfmv_v_f_f32m1(pC[0], vl); + _sum0 = __riscv_vfmv_v_f_f32m1(pC[0], vl); + _sum1 = __riscv_vfmv_v_f_f32m1(pC[0], vl); } if (broadcast_type_C == 1 || broadcast_type_C == 2) { - _sum0 = vfmv_v_f_f32m1(pC[0], vl); - _sum1 = vfmv_v_f_f32m1(pC[1], vl); + _sum0 = __riscv_vfmv_v_f_f32m1(pC[0], vl); + _sum1 = __riscv_vfmv_v_f_f32m1(pC[1], vl); } if (broadcast_type_C == 3) { - vlseg2e32_v_f32m1(&_sum0, &_sum1, pC, vl); - pC += 8; + vfloat32m1x2_t _s0 = __riscv_vlseg2e32_v_f32m1x2(pC, vl); + _sum0 = __riscv_vget_v_f32m1x2_f32m1(_s0, 0); + _sum1 = __riscv_vget_v_f32m1x2_f32m1(_s0, 1); + pC += packn * 2; } if (broadcast_type_C == 4) { - _sum0 = vle32_v_f32m1(pC, vl); + _sum0 = __riscv_vle32_v_f32m1(pC, vl); _sum1 = _sum0; - pC += 4; + pC += packn; } } } else { - vfloat32m1_t _tmp0; - vfloat32m1_t _tmp1; - vlseg2e32_v_f32m1(&_tmp0, &_tmp1, outptr, vl); - _sum0 = _tmp0; - _sum1 = _tmp1; + vfloat32m1x2_t _s0 = __riscv_vlseg2e32_v_f32m1x2(outptr, vl); + _sum0 = __riscv_vget_v_f32m1x2_f32m1(_s0, 0); + _sum1 = __riscv_vget_v_f32m1x2_f32m1(_s0, 1); } const float* pA = pAT; int kk = 0; for (; kk < max_kk; kk += 1) { - vfloat32m1_t _pB = vle32_v_f32m1(pB, vl); + vfloat32m1_t _pB = __riscv_vle32_v_f32m1(pB, vl); - _sum0 = vfmadd_vf_f32m1(_pB, pA[0], _sum0, vl); - _sum1 = vfmadd_vf_f32m1(_pB, pA[1], _sum1, vl); + _sum0 = __riscv_vfmadd_vf_f32m1(_pB, pA[0], _sum0, vl); + _sum1 = __riscv_vfmadd_vf_f32m1(_pB, pA[1], _sum1, vl); pA += 2; - pB += 4; + pB += packn; } if (k_end) { // if (out_elempack == 1) { - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + out_hstep, _sum1, vl); - outptr0 += 4; + __riscv_vse32_v_f32m1(outptr0, _sum0, vl); + __riscv_vse32_v_f32m1(outptr0 + out_hstep, _sum1, vl); + outptr0 += packn; } } else { - vsseg2e32_v_f32m1(outptr, _sum0, _sum1, vl); + __riscv_vsseg2e32_v_f32m1x2(outptr, __riscv_vcreate_v_f32m1x2(_sum0, _sum1), vl); } - outptr += 8; + outptr += packn * 2; } #endif // __riscv_vector for (; jj + 1 < max_jj; jj += 2) @@ -3214,195 +1258,59 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons int jj = 0; #if __riscv_vector - for (; jj + 11 < max_jj; jj += 12) - { - vfloat32m1_t _sum0; - vfloat32m1_t _sum1; - vfloat32m1_t _sum2; - - if (k == 0) - { - _sum0 = vfmv_v_f_f32m1(0.f, vl); - _sum1 = vfmv_v_f_f32m1(0.f, vl); - _sum2 = vfmv_v_f_f32m1(0.f, vl); - - if (pC) - { - if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2) - { - _sum0 = vfmv_v_f_f32m1(pC[0], vl); - _sum1 = vfmv_v_f_f32m1(pC[0], vl); - _sum2 = vfmv_v_f_f32m1(pC[0], vl); - } - if (broadcast_type_C == 3 || broadcast_type_C == 4) - { - _sum0 = vle32_v_f32m1(pC, vl); - _sum1 = vle32_v_f32m1(pC + 4, vl); - _sum2 = vle32_v_f32m1(pC + 8, vl); - pC += 12; - } - } - } - else - { - 
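// Editor's note: the two-row tail above keeps its row-0/row-1 partial sums
// interleaved in the temporary tile buffer. The old API filled two vectors
// through output pointers; the new one goes through a vfloat32m1x2_t tuple.
// A minimal sketch of that spill/reload pair (helper names are illustrative).
#include <riscv_vector.h>

static inline void spill_two_rows(float* buf, vfloat32m1_t sum0, vfloat32m1_t sum1, size_t vl)
{
    // interleaved layout: buf = s0[0] s1[0] s0[1] s1[1] ...
    __riscv_vsseg2e32_v_f32m1x2(buf, __riscv_vcreate_v_f32m1x2(sum0, sum1), vl);
}

static inline void reload_two_rows(const float* buf, vfloat32m1_t* sum0, vfloat32m1_t* sum1, size_t vl)
{
    vfloat32m1x2_t s = __riscv_vlseg2e32_v_f32m1x2(buf, vl);
    *sum0 = __riscv_vget_v_f32m1x2_f32m1(s, 0);
    *sum1 = __riscv_vget_v_f32m1x2_f32m1(s, 1);
}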
_sum0 = vle32_v_f32m1(outptr, vl); - _sum1 = vle32_v_f32m1(outptr + 4, vl); - _sum2 = vle32_v_f32m1(outptr + 8, vl); - } - - const float* pA = pAT; - int kk = 0; - for (; kk < max_kk; kk += 1) - { - vfloat32m1_t _pB0 = vle32_v_f32m1(pB, vl); - vfloat32m1_t _pB1 = vle32_v_f32m1(pB + 4, vl); - vfloat32m1_t _pB2 = vle32_v_f32m1(pB + 8, vl); - - vfloat32m1_t _pA0 = vfmv_v_f_f32m1(pA[0], vl); - - _sum0 = vfmadd_vv_f32m1(_pA0, _pB0, _sum0, vl); - _sum1 = vfmadd_vv_f32m1(_pA0, _pB1, _sum1, vl); - _sum2 = vfmadd_vv_f32m1(_pA0, _pB2, _sum2, vl); - - pA += 1; - pB += 12; - } - - if (k_end) - { - // if (out_elempack == 1) - { - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + 4, _sum1, vl); - vse32_v_f32m1(outptr0 + 8, _sum2, vl); - outptr0 += 12; - } - } - else - { - vse32_v_f32m1(outptr, _sum0, vl); - vse32_v_f32m1(outptr + 4, _sum1, vl); - vse32_v_f32m1(outptr + 8, _sum2, vl); - } - - outptr += 12; - } - for (; jj + 7 < max_jj; jj += 8) - { - vfloat32m1_t _sum0; - vfloat32m1_t _sum1; - - if (k == 0) - { - _sum0 = vfmv_v_f_f32m1(0.f, vl); - _sum1 = vfmv_v_f_f32m1(0.f, vl); - - if (pC) - { - if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2) - { - _sum0 = vfmv_v_f_f32m1(pC[0], vl); - _sum1 = vfmv_v_f_f32m1(pC[0], vl); - } - if (broadcast_type_C == 3 || broadcast_type_C == 4) - { - _sum0 = vle32_v_f32m1(pC, vl); - _sum1 = vle32_v_f32m1(pC + 4, vl); - pC += 8; - } - } - } - else - { - _sum0 = vle32_v_f32m1(outptr, vl); - _sum1 = vle32_v_f32m1(outptr + 4, vl); - } - - const float* pA = pAT; - int kk = 0; - for (; kk < max_kk; kk += 1) - { - vfloat32m1_t _pB0 = vle32_v_f32m1(pB, vl); - vfloat32m1_t _pB1 = vle32_v_f32m1(pB + 4, vl); - - vfloat32m1_t _pA0 = vfmv_v_f_f32m1(pA[0], vl); - _sum0 = vfmadd_vv_f32m1(_pA0, _pB0, _sum0, vl); - _sum1 = vfmadd_vv_f32m1(_pA0, _pB1, _sum1, vl); - - pA += 1; - pB += 8; - } - - if (k_end) - { - // if (out_elempack == 1) - { - vse32_v_f32m1(outptr0, _sum0, vl); - vse32_v_f32m1(outptr0 + 4, _sum1, vl); - outptr0 += 8; - } - } - else - { - vse32_v_f32m1(outptr, _sum0, vl); - vse32_v_f32m1(outptr + 4, _sum1, vl); - } - - outptr += 8; - } - for (; jj + 3 < max_jj; jj += 4) + for (; jj + (packn - 1) < max_jj; jj += packn) { vfloat32m1_t _sum; if (k == 0) { - _sum = vfmv_v_f_f32m1(0.f, vl); + _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (pC) { if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2) { - _sum = vfmv_v_f_f32m1(pC[0], vl); + _sum = __riscv_vfmv_v_f_f32m1(pC[0], vl); } if (broadcast_type_C == 3 || broadcast_type_C == 4) { - _sum = vle32_v_f32m1(pC, vl); - pC += 4; + _sum = __riscv_vle32_v_f32m1(pC, vl); + pC += packn; } } } else { - _sum = vle32_v_f32m1(outptr, vl); + _sum = __riscv_vle32_v_f32m1(outptr, vl); } const float* pA = pAT; int kk = 0; for (; kk < max_kk; kk += 1) { - vfloat32m1_t _pB = vle32_v_f32m1(pB, vl); - vfloat32m1_t _pA = vfmv_v_f_f32m1(pA[0], vl); + vfloat32m1_t _pB = __riscv_vle32_v_f32m1(pB, vl); + vfloat32m1_t _pA = __riscv_vfmv_v_f_f32m1(pA[0], vl); - _sum = vfmadd_vv_f32m1(_pA, _pB, _sum, vl); + _sum = __riscv_vfmadd_vv_f32m1(_pA, _pB, _sum, vl); pA += 1; - pB += 4; + pB += packn; } if (k_end) { // if (out_elempack == 1) { - vse32_v_f32m1(outptr0, _sum, vl); - outptr0 += 4; + __riscv_vse32_v_f32m1(outptr0, _sum, vl); + outptr0 += packn; } } else { - vse32_v_f32m1(outptr, _sum, vl); + __riscv_vse32_v_f32m1(outptr, _sum, vl); } - outptr += 4; + outptr += packn; } #endif // __riscv_vector for (; jj + 1 < max_jj; jj += 2) @@ -3529,8 +1437,14 @@ static void get_optimal_tile_mnk(int M, int 
N, int K, int constant_TILE_M, int c int tile_size = (int)sqrtf((float)l2_cache_size / 3 / sizeof(float)); - TILE_M = std::max(8, tile_size / 8 * 8); - TILE_N = std::max(4, tile_size / 4 * 4); +#if __riscv_vector + const int packn = csrr_vlenb() / 4; +#else + const int packn = 4; +#endif + + TILE_M = std::max(packn, tile_size / packn * packn); + TILE_N = std::max(packn, tile_size / packn * packn); TILE_K = std::max(8, tile_size / 8 * 8); if (K > 0) @@ -3541,8 +1455,8 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c if (nn_K == 1) { tile_size = (int)((float)l2_cache_size / 2 / sizeof(float) / TILE_K); - TILE_M = std::max(8, tile_size / 8 * 8); - TILE_N = std::max(4, tile_size / 4 * 4); + TILE_M = std::max(packn, tile_size / packn * packn); + TILE_N = std::max(packn, tile_size / packn * packn); } } @@ -3551,29 +1465,29 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c if (M > 0) { int nn_M = (M + TILE_M - 1) / TILE_M; - TILE_M = std::min(TILE_M, ((M + nn_M - 1) / nn_M + 7) / 8 * 8); + TILE_M = std::min(TILE_M, ((M + nn_M - 1) / nn_M + (packn - 1)) / packn * packn); } if (N > 0) { int nn_N = (N + TILE_N - 1) / TILE_N; - TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 3) / 4 * 4); + TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + (packn - 1)) / packn * packn); } if (nT > 1) { - TILE_M = std::min(TILE_M, (std::max(1, TILE_M / nT) + 7) / 8 * 8); + TILE_M = std::min(TILE_M, (std::max(1, TILE_M / nT) + (packn - 1)) / packn * packn); } // always take constant TILE_M/N/K value when provided if (constant_TILE_M > 0) { - TILE_M = (constant_TILE_M + 7) / 8 * 8; + TILE_M = (constant_TILE_M + (packn - 1)) / packn * packn; } if (constant_TILE_N > 0) { - TILE_N = (constant_TILE_N + 3) / 4 * 4; + TILE_N = (constant_TILE_N + (packn - 1)) / packn * packn; } if (constant_TILE_K > 0) @@ -3582,7 +1496,7 @@ static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int c } } -static int gemm_riscv(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int broadcast_type_C, int transA, int transB, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, size_t vl, const Option& opt) +static int gemm_riscv(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int broadcast_type_C, int transA, int transB, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt) { const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack; const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w; @@ -3624,11 +1538,11 @@ static int gemm_riscv(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, i if (transB) { - pack_B_tile(B, BT_tile, j, max_jj, k, max_kk, vl); + pack_B_tile(B, BT_tile, j, max_jj, k, max_kk); } else { - transpose_pack_B_tile(B, BT_tile, j, max_jj, k, max_kk, vl); + transpose_pack_B_tile(B, BT_tile, j, max_jj, k, max_kk); } } @@ -3657,7 +1571,7 @@ static int gemm_riscv(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, i if (broadcast_type_C == 3) { - pack_A_tile(C, topT_tile, i, max_ii, j, max_jj, vl); + pack_A_tile(C, topT_tile, i, max_ii, j, max_jj); } const Mat& CT_tile = broadcast_type_C == 3 ? 
topT_tile : C; @@ -3676,21 +1590,21 @@ static int gemm_riscv(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, i { if (transA) { - transpose_pack_A_tile(A, AT_tile, i, max_ii, k, max_kk, vl); + transpose_pack_A_tile(A, AT_tile, i, max_ii, k, max_kk); } else { - pack_A_tile(A, AT_tile, i, max_ii, k, max_kk, vl); + pack_A_tile(A, AT_tile, i, max_ii, k, max_kk); } } bool k_end = !output_transpose && k + TILE_K >= K; - gemm_transB_packed_tile(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i, max_ii, j, max_jj, k, max_kk, k_end, vl); + gemm_transB_packed_tile(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i, max_ii, j, max_jj, k, max_kk, k_end); } if (output_transpose) { - transpose_unpack_output_tile(topT_tile, top_blob, i, max_ii, j, max_jj, vl); + transpose_unpack_output_tile(topT_tile, top_blob, i, max_ii, j, max_jj); } } } @@ -3698,7 +1612,7 @@ static int gemm_riscv(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, i return 0; } -static int gemm_AT_riscv(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blob, int broadcast_type_C, int M, int K, int transB, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, size_t vl, const Option& opt) +static int gemm_AT_riscv(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blob, int broadcast_type_C, int M, int K, int transB, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt) { const int N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w; @@ -3734,11 +1648,11 @@ static int gemm_AT_riscv(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blo if (transB) { - pack_B_tile(B, BT_tile, j, max_jj, k, max_kk, vl); + pack_B_tile(B, BT_tile, j, max_jj, k, max_kk); } else { - transpose_pack_B_tile(B, BT_tile, j, max_jj, k, max_kk, vl); + transpose_pack_B_tile(B, BT_tile, j, max_jj, k, max_kk); } } @@ -3762,7 +1676,7 @@ static int gemm_AT_riscv(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blo if (broadcast_type_C == 3) { - pack_A_tile(C, topT_tile, i, max_ii, j, max_jj, vl); + pack_A_tile(C, topT_tile, i, max_ii, j, max_jj); } const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C; @@ -3777,12 +1691,12 @@ static int gemm_AT_riscv(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blo Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1); bool k_end = !output_transpose && k + TILE_K >= K; - gemm_transB_packed_tile(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i, max_ii, j, max_jj, k, max_kk, k_end, vl); + gemm_transB_packed_tile(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i, max_ii, j, max_jj, k, max_kk, k_end); } if (output_transpose) { - transpose_unpack_output_tile(topT_tile, top_blob, i, max_ii, j, max_jj, vl); + transpose_unpack_output_tile(topT_tile, top_blob, i, max_ii, j, max_jj); } } } @@ -3790,7 +1704,7 @@ static int gemm_AT_riscv(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blo return 0; } -static int gemm_BT_riscv(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blob, int broadcast_type_C, int N, int K, int transA, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, size_t vl, const Option& opt) +static int gemm_BT_riscv(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blob, int broadcast_type_C, int N, int K, int transA, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt) { const int M = transA ? 
A.w : (A.dims == 3 ? A.c : A.h) * A.elempack; @@ -3831,7 +1745,7 @@ static int gemm_BT_riscv(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blo if (broadcast_type_C == 3) { - pack_A_tile(C, topT_tile, i, max_ii, j, max_jj, vl); + pack_A_tile(C, topT_tile, i, max_ii, j, max_jj); } const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C; @@ -3850,22 +1764,22 @@ static int gemm_BT_riscv(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blo { if (transA) { - transpose_pack_A_tile(A, AT_tile, i, max_ii, k, max_kk, vl); + transpose_pack_A_tile(A, AT_tile, i, max_ii, k, max_kk); } else { - pack_A_tile(A, AT_tile, i, max_ii, k, max_kk, vl); + pack_A_tile(A, AT_tile, i, max_ii, k, max_kk); } } bool k_end = !output_transpose && k + TILE_K >= K; - gemm_transB_packed_tile(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i, max_ii, j, max_jj, k, max_kk, k_end, vl); + gemm_transB_packed_tile(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i, max_ii, j, max_jj, k, max_kk, k_end); } if (output_transpose) { - transpose_unpack_output_tile(topT_tile, top_blob, i, max_ii, j, max_jj, vl); + transpose_unpack_output_tile(topT_tile, top_blob, i, max_ii, j, max_jj); } } } @@ -3873,7 +1787,7 @@ static int gemm_BT_riscv(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blo return 0; } -static int gemm_AT_BT_riscv(const Mat& AT, const Mat& BT, const Mat& C, Mat& top_blob, int broadcast_type_C, int M, int N, int K, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, size_t vl, const Option& opt) +static int gemm_AT_BT_riscv(const Mat& AT, const Mat& BT, const Mat& C, Mat& top_blob, int broadcast_type_C, int M, int N, int K, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt) { // NCNN_LOGE("M/N/K = %d %d %d", M, N, K); @@ -3906,7 +1820,7 @@ static int gemm_AT_BT_riscv(const Mat& AT, const Mat& BT, const Mat& C, Mat& top if (broadcast_type_C == 3) { - pack_A_tile(C, topT_tile, i, max_ii, j, max_jj, vl); + pack_A_tile(C, topT_tile, i, max_ii, j, max_jj); } const Mat& CT_tile = broadcast_type_C == 3 ? 
topT_tile : C; @@ -3923,12 +1837,12 @@ static int gemm_AT_BT_riscv(const Mat& AT, const Mat& BT, const Mat& C, Mat& top bool k_end = !output_transpose && k + TILE_K >= K; - gemm_transB_packed_tile(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i, max_ii, j, max_jj, k, max_kk, k_end, vl); + gemm_transB_packed_tile(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i, max_ii, j, max_jj, k, max_kk, k_end); } if (output_transpose) { - transpose_unpack_output_tile(topT_tile, top_blob, i, max_ii, j, max_jj, vl); + transpose_unpack_output_tile(topT_tile, top_blob, i, max_ii, j, max_jj); } } } @@ -3983,11 +1897,11 @@ int Gemm_riscv::create_pipeline(const Option& opt) if (transA) { - transpose_pack_A_tile(A_data, AT_tile, i, max_ii, k, max_kk, vl); + transpose_pack_A_tile(A_data, AT_tile, i, max_ii, k, max_kk); } else { - pack_A_tile(A_data, AT_tile, i, max_ii, k, max_kk, vl); + pack_A_tile(A_data, AT_tile, i, max_ii, k, max_kk); } } } @@ -4024,11 +1938,11 @@ int Gemm_riscv::create_pipeline(const Option& opt) if (transB) { - pack_B_tile(B_data, BT_tile, j, max_jj, k, max_kk, vl); + pack_B_tile(B_data, BT_tile, j, max_jj, k, max_kk); } else { - transpose_pack_B_tile(B_data, BT_tile, j, max_jj, k, max_kk, vl); + transpose_pack_B_tile(B_data, BT_tile, j, max_jj, k, max_kk); } } } @@ -4042,9 +1956,11 @@ int Gemm_riscv::create_pipeline(const Option& opt) CT_data = C_data; #if __riscv_vector + const int packn = csrr_vlenb() / 4; + if (constant_broadcast_type_C == 3 && opt.use_packing_layout) { - int C_elempack = constantM % 4 == 0 ? 4 : 1; + int C_elempack = constantM % packn == 0 ? packn : 1; convert_packing(C_data, CT_data, C_elempack, opt); } #endif // __riscv_vector @@ -4189,12 +2105,16 @@ int Gemm_riscv::forward(const std::vector& bottom_blobs, std::vector& } } +#if __riscv_vector + const int packn = csrr_vlenb() / 4; +#endif + int out_elempack = 1; #if __riscv_vector if (opt.use_packing_layout) { int outh = output_transpose ? N : M; - out_elempack = outh % 4 == 0 ? 4 : 1; + out_elempack = outh % packn == 0 ? 
packn : 1; } #endif // __riscv_vector if (output_elempack) @@ -4230,23 +2150,23 @@ int Gemm_riscv::forward(const std::vector& bottom_blobs, std::vector& int ret = 0; if (constantA && constantB) { - ret = gemm_AT_BT_riscv(AT_data, BT_data, C, top_blob, broadcast_type_C, constantM, constantN, constantK, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, vl, opt); + ret = gemm_AT_BT_riscv(AT_data, BT_data, C, top_blob, broadcast_type_C, constantM, constantN, constantK, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt); } else if (constantA) { const Mat& B = bottom_blobs[0]; - ret = gemm_AT_riscv(AT_data, B, C, top_blob, broadcast_type_C, constantM, constantK, transB, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, vl, opt); + ret = gemm_AT_riscv(AT_data, B, C, top_blob, broadcast_type_C, constantM, constantK, transB, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt); } else if (constantB) { const Mat& A = bottom_blobs[0]; - ret = gemm_BT_riscv(A, BT_data, C, top_blob, broadcast_type_C, constantN, constantK, transA, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, vl, opt); + ret = gemm_BT_riscv(A, BT_data, C, top_blob, broadcast_type_C, constantN, constantK, transA, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt); } else { const Mat& A = bottom_blobs[0]; const Mat& B = bottom_blobs[1]; - ret = gemm_riscv(A, B, C, top_blob, broadcast_type_C, transA, transB, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, vl, opt); + ret = gemm_riscv(A, B, C, top_blob, broadcast_type_C, transA, transB, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt); } if (ret != 0) return ret; diff --git a/src/layer/riscv/gemm_riscv.h b/src/layer/riscv/gemm_riscv.h index 6bca092fb1f..967a9ee12c9 100644 --- a/src/layer/riscv/gemm_riscv.h +++ b/src/layer/riscv/gemm_riscv.h @@ -30,9 +30,8 @@ class Gemm_riscv : public Gemm virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; - // public: +public: int nT; - size_t vl; Mat AT_data; Mat BT_data; Mat CT_data; diff --git a/src/layer/riscv/gru_riscv.cpp b/src/layer/riscv/gru_riscv.cpp index 25218ddc32e..97d7cb82eb0 100644 --- a/src/layer/riscv/gru_riscv.cpp +++ b/src/layer/riscv/gru_riscv.cpp @@ -18,6 +18,8 @@ #include #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { //core rvv-optimized gru impl. 
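// Editor's note: with the stored `vl` member removed from Gemm_riscv, the packing
// width is re-derived from the vector register size wherever it is needed, and the
// tile sizes above are rounded to a multiple of it. A sketch of that pattern;
// csrr_vlenb() is the ncnn helper used in gemm_riscv.cpp to read the vlenb CSR
// (VLEN in bytes) and is assumed to be in scope; the helper names are illustrative.
#include <riscv_vector.h>

static inline int fp32_packn()
{
    return csrr_vlenb() / 4; // fp32 lanes per m1 register, e.g. 4 for VLEN=128, 8 for VLEN=256
}

static inline int round_up_packn(int x, int packn)
{
    return (x + packn - 1) / packn * packn; // rounding used for TILE_M / TILE_N above
}

// inside a kernel, vl is then requested locally instead of being cached on the layer:
//   const size_t vl = __riscv_vsetvl_e32m1(fp32_packn()); // equals packn on an RVV 1.0 core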
@@ -63,20 +65,20 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_xcu = weight_xc_U; while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _x = vle32_v_f32m8(ptr_x, vl); - vfloat32m8_t _xcr = vle32_v_f32m8(ptr_xcr, vl); - vfloat32m8_t _xcu = vle32_v_f32m8(ptr_xcu, vl); - vfloat32m1_t _scalar_r = vfmv_s_f_f32m1(vundefined_f32m1(), R, vl); - vfloat32m1_t _scalar_u = vfmv_s_f_f32m1(vundefined_f32m1(), U, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _x = __riscv_vle32_v_f32m8(ptr_x, vl); + vfloat32m8_t _xcr = __riscv_vle32_v_f32m8(ptr_xcr, vl); + vfloat32m8_t _xcu = __riscv_vle32_v_f32m8(ptr_xcu, vl); + vfloat32m1_t _scalar_r = __riscv_vfmv_s_f_f32m1(R, vl); + vfloat32m1_t _scalar_u = __riscv_vfmv_s_f_f32m1(U, vl); - _xcr = vfmul_vv_f32m8(_xcr, _x, vl); - _xcu = vfmul_vv_f32m8(_xcu, _x, vl); - _scalar_r = vfredusum_vs_f32m8_f32m1(_scalar_r, _xcr, _scalar_r, vl); - _scalar_u = vfredusum_vs_f32m8_f32m1(_scalar_u, _xcu, _scalar_u, vl); + _xcr = __riscv_vfmul_vv_f32m8(_xcr, _x, vl); + _xcu = __riscv_vfmul_vv_f32m8(_xcu, _x, vl); + _scalar_r = __riscv_vfredusum_vs_f32m8_f32m1(_xcr, _scalar_r, vl); + _scalar_u = __riscv_vfredusum_vs_f32m8_f32m1(_xcu, _scalar_u, vl); - R = vfmv_f_s_f32m1_f32(_scalar_r); - U = vfmv_f_s_f32m1_f32(_scalar_u); + R = __riscv_vfmv_f_s_f32m1_f32(_scalar_r); + U = __riscv_vfmv_f_s_f32m1_f32(_scalar_u); ptr_x += vl; ptr_xcr += vl; @@ -93,20 +95,20 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_hcu = weight_hc_U; while (n_out > 0) { - size_t vl = vsetvl_e32m8(n_out); - vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc, vl); - vfloat32m8_t _hcr = vle32_v_f32m8(ptr_hcr, vl); - vfloat32m8_t _hcu = vle32_v_f32m8(ptr_hcu, vl); - vfloat32m1_t _scalar_r = vfmv_s_f_f32m1(vundefined_f32m1(), R, vl); - vfloat32m1_t _scalar_u = vfmv_s_f_f32m1(vundefined_f32m1(), U, vl); + size_t vl = __riscv_vsetvl_e32m8(n_out); + vfloat32m8_t _h_cont = __riscv_vle32_v_f32m8(ptr_hc, vl); + vfloat32m8_t _hcr = __riscv_vle32_v_f32m8(ptr_hcr, vl); + vfloat32m8_t _hcu = __riscv_vle32_v_f32m8(ptr_hcu, vl); + vfloat32m1_t _scalar_r = __riscv_vfmv_s_f_f32m1(R, vl); + vfloat32m1_t _scalar_u = __riscv_vfmv_s_f_f32m1(U, vl); - _hcr = vfmul_vv_f32m8(_hcr, _h_cont, vl); - _hcu = vfmul_vv_f32m8(_hcu, _h_cont, vl); - _scalar_r = vfredusum_vs_f32m8_f32m1(_scalar_r, _hcr, _scalar_r, vl); - _scalar_u = vfredusum_vs_f32m8_f32m1(_scalar_u, _hcu, _scalar_u, vl); + _hcr = __riscv_vfmul_vv_f32m8(_hcr, _h_cont, vl); + _hcu = __riscv_vfmul_vv_f32m8(_hcu, _h_cont, vl); + _scalar_r = __riscv_vfredusum_vs_f32m8_f32m1(_hcr, _scalar_r, vl); + _scalar_u = __riscv_vfredusum_vs_f32m8_f32m1(_hcu, _scalar_u, vl); - R = vfmv_f_s_f32m1_f32(_scalar_r); - U = vfmv_f_s_f32m1_f32(_scalar_u); + R = __riscv_vfmv_f_s_f32m1_f32(_scalar_r); + U = __riscv_vfmv_f_s_f32m1_f32(_scalar_u); ptr_hc += vl; ptr_hcr += vl; @@ -136,16 +138,16 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_whc_n = weight_hc_N; while (n_out2 > 0) { - size_t vl = vsetvl_e32m8(n_out2); + size_t vl = __riscv_vsetvl_e32m8(n_out2); - vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc2, vl); - vfloat32m8_t _whc_n = vle32_v_f32m8(ptr_whc_n, vl); - vfloat32m1_t _scalar_n = vfmv_s_f_f32m1(vundefined_f32m1(), N, vl); + vfloat32m8_t _h_cont = __riscv_vle32_v_f32m8(ptr_hc2, vl); + vfloat32m8_t _whc_n = __riscv_vle32_v_f32m8(ptr_whc_n, vl); + vfloat32m1_t _scalar_n = __riscv_vfmv_s_f_f32m1(N, vl); - _h_cont = 
vfmul_vv_f32m8(_whc_n, _h_cont, vl); - _scalar_n = vfredusum_vs_f32m8_f32m1(_scalar_n, _h_cont, _scalar_n, vl); + _h_cont = __riscv_vfmul_vv_f32m8(_whc_n, _h_cont, vl); + _scalar_n = __riscv_vfredusum_vs_f32m8_f32m1(_h_cont, _scalar_n, vl); - N = vfmv_f_s_f32m1_f32(_scalar_n); + N = __riscv_vfmv_f_s_f32m1_f32(_scalar_n); n_out2 -= vl; ptr_hc2 += vl; ptr_whc_n += vl; @@ -160,15 +162,15 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_xcn = weight_xc_N; while (n2 > 0) { - size_t vl = vsetvl_e32m8(n2); + size_t vl = __riscv_vsetvl_e32m8(n2); - vfloat32m8_t _x = vle32_v_f32m8(ptr_x2, vl); - vfloat32m8_t _xcn = vle32_v_f32m8(ptr_xcn, vl); - vfloat32m1_t _scalar_n = vfmv_s_f_f32m1(vundefined_f32m1(), N, vl); + vfloat32m8_t _x = __riscv_vle32_v_f32m8(ptr_x2, vl); + vfloat32m8_t _xcn = __riscv_vle32_v_f32m8(ptr_xcn, vl); + vfloat32m1_t _scalar_n = __riscv_vfmv_s_f_f32m1(N, vl); - _xcn = vfmul_vv_f32m8(_x, _xcn, vl); - _scalar_n = vfredusum_vs_f32m8_f32m1(_scalar_n, _xcn, _scalar_n, vl); - N = vfmv_f_s_f32m1_f32(_scalar_n); + _xcn = __riscv_vfmul_vv_f32m8(_x, _xcn, vl); + _scalar_n = __riscv_vfredusum_vs_f32m8_f32m1(_xcn, _scalar_n, vl); + N = __riscv_vfmv_f_s_f32m1_f32(_scalar_n); n2 -= vl; ptr_x2 += vl; @@ -208,8 +210,12 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we GRU_riscv::GRU_riscv() { -#if __riscv_vector && __riscv_zfh - support_fp16_storage = true; +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif } @@ -223,7 +229,7 @@ int GRU_riscv::create_pipeline(const Option& opt) } #endif -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH if (opt.use_fp16_storage && opt.use_fp16_arithmetic) return create_pipeline_fp16sa(opt); #endif @@ -240,10 +246,9 @@ int GRU_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } #endif +#if NCNN_ZFH int elembits = bottom_blob.elembits(); -#if __riscv_vector -#if __riscv_zfh if (opt.use_fp16_storage && elembits == 16) { if (opt.use_fp16_arithmetic) @@ -253,6 +258,8 @@ int GRU_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } #endif +#if __riscv_vector + int T = bottom_blob.h; int num_directions = direction == 2 ? 2 : 1; @@ -322,10 +329,10 @@ int GRU_riscv::forward(const std::vector& bottom_blobs, std::vector& t #endif const Mat& bottom_blob = bottom_blobs[0]; + +#if NCNN_ZFH int elembits = bottom_blob.elembits(); -#if __riscv_vector -#if __riscv_zfh if (opt.use_fp16_storage && elembits == 16) { if (opt.use_fp16_arithmetic) @@ -335,6 +342,7 @@ int GRU_riscv::forward(const std::vector& bottom_blobs, std::vector& t } #endif +#if __riscv_vector int T = bottom_blob.h; int num_directions = direction == 2 ? 2 : 1; @@ -408,648 +416,4 @@ int GRU_riscv::forward(const std::vector& bottom_blobs, std::vector& t return GRU::forward(bottom_blobs, top_blobs, opt); } -#if __riscv_vector && __riscv_zfh -static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt) -{ - int size = bottom_blob.w; - int T = bottom_blob.h; - - int num_output = top_blob.w; - - // 2 x num_output - Mat gates(2, num_output, 4u, opt.workspace_allocator); - if (gates.empty()) - return -100; - - // unroll - for (int t = 0; t < T; t++) - { - int ti = reverse ? 
T - 1 - t : t; - - const __fp16* x = bottom_blob.row(ti); - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < num_output; q++) - { - float* gates_data = gates.row(q); - - // gate reset update - const float* bias_c_R = bias_c.row(0); - const float* bias_c_U = bias_c.row(1); - - const float* weight_xc_R = weight_xc.row(num_output * 0 + q); - const float* weight_xc_U = weight_xc.row(num_output * 1 + q); - const float* weight_hc_R = weight_hc.row(num_output * 0 + q); - const float* weight_hc_U = weight_hc.row(num_output * 1 + q); - - float R = bias_c_R[q]; - float U = bias_c_U[q]; - - int n = size; - const __fp16* ptr_x = x; - const float* ptr_xcr = weight_xc_R; - const float* ptr_xcu = weight_xc_U; - while (n > 0) - { - size_t vl = vsetvl_e16m4(n); - vfloat32m8_t _x = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_x, vl), vl); - vfloat32m8_t _xcr = vle32_v_f32m8(ptr_xcr, vl); - vfloat32m8_t _xcu = vle32_v_f32m8(ptr_xcu, vl); - vfloat32m1_t _scalar_r = vfmv_s_f_f32m1(vundefined_f32m1(), R, vl); - vfloat32m1_t _scalar_u = vfmv_s_f_f32m1(vundefined_f32m1(), U, vl); - - _xcr = vfmul_vv_f32m8(_xcr, _x, vl); - _xcu = vfmul_vv_f32m8(_xcu, _x, vl); - _scalar_r = vfredusum_vs_f32m8_f32m1(_scalar_r, _xcr, _scalar_r, vl); - _scalar_u = vfredusum_vs_f32m8_f32m1(_scalar_u, _xcu, _scalar_u, vl); - - R = vfmv_f_s_f32m1_f32(_scalar_r); - U = vfmv_f_s_f32m1_f32(_scalar_u); - - ptr_x += vl; - ptr_xcr += vl; - ptr_xcu += vl; - n -= vl; - } - ptr_x = NULL; - ptr_xcr = NULL; - ptr_xcu = NULL; - - int n_out = num_output; - const float* ptr_hc = hidden_state; - const float* ptr_hcr = weight_hc_R; - const float* ptr_hcu = weight_hc_U; - while (n_out > 0) - { - size_t vl = vsetvl_e16m4(n_out); - vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc, vl); - vfloat32m8_t _hcr = vle32_v_f32m8(ptr_hcr, vl); - vfloat32m8_t _hcu = vle32_v_f32m8(ptr_hcu, vl); - vfloat32m1_t _scalar_r = vfmv_s_f_f32m1(vundefined_f32m1(), R, vl); - vfloat32m1_t _scalar_u = vfmv_s_f_f32m1(vundefined_f32m1(), U, vl); - - _hcr = vfmul_vv_f32m8(_hcr, _h_cont, vl); - _hcu = vfmul_vv_f32m8(_hcu, _h_cont, vl); - _scalar_r = vfredusum_vs_f32m8_f32m1(_scalar_r, _hcr, _scalar_r, vl); - _scalar_u = vfredusum_vs_f32m8_f32m1(_scalar_u, _hcu, _scalar_u, vl); - - R = vfmv_f_s_f32m1_f32(_scalar_r); - U = vfmv_f_s_f32m1_f32(_scalar_u); - - ptr_hc += vl; - ptr_hcr += vl; - ptr_hcu += vl; - n_out -= vl; - } - ptr_hc = NULL; - ptr_hcr = NULL; - ptr_hcu = NULL; - - // sigmoid(R) - // sigmoid(U) - R = 1.f / (1.f + exp(-R)); - U = 1.f / (1.f + exp(-U)); - - // gate new - const float* bias_c_WN = bias_c.row(2); - const float* bias_c_BN = bias_c.row(3); - - const float* weight_xc_N = weight_xc.row(num_output * 2 + q); - const float* weight_hc_N = weight_hc.row(num_output * 2 + q); - - float N = bias_c_BN[q]; - - int n_out2 = num_output; - const float* ptr_hc2 = hidden_state; - const float* ptr_whc_n = weight_hc_N; - while (n_out2 > 0) - { - size_t vl = vsetvl_e16m4(n_out2); - - vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc2, vl); - vfloat32m8_t _whc_n = vle32_v_f32m8(ptr_whc_n, vl); - vfloat32m1_t _scalar_n = vfmv_s_f_f32m1(vundefined_f32m1(), N, vl); - - _h_cont = vfmul_vv_f32m8(_whc_n, _h_cont, vl); - _scalar_n = vfredusum_vs_f32m8_f32m1(_scalar_n, _h_cont, _scalar_n, vl); - - N = vfmv_f_s_f32m1_f32(_scalar_n); - n_out2 -= vl; - ptr_hc2 += vl; - ptr_whc_n += vl; - } - ptr_hc2 = NULL; - ptr_whc_n = NULL; - - N = bias_c_WN[q] + R * N; - - int n2 = size; - const __fp16* ptr_x2 = x; - const float* ptr_xcn = weight_xc_N; - while (n2 > 0) - { - size_t vl = 
vsetvl_e16m4(n2); - - vfloat32m8_t _x = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_x2, vl), vl); - vfloat32m8_t _xcn = vle32_v_f32m8(ptr_xcn, vl); - vfloat32m1_t _scalar_n = vfmv_s_f_f32m1(vundefined_f32m1(), N, vl); - - _xcn = vfmul_vv_f32m8(_x, _xcn, vl); - _scalar_n = vfredusum_vs_f32m8_f32m1(_scalar_n, _xcn, _scalar_n, vl); - N = vfmv_f_s_f32m1_f32(_scalar_n); - - n2 -= vl; - ptr_x2 += vl; - ptr_xcn += vl; - } - ptr_x2 = NULL; - ptr_xcn = NULL; - - // tanh(N) - N = tanh(N); - - gates_data[0] = U; - gates_data[1] = N; - } - - // h_t := (1 - update) .* new + update .* h_{t-1} - __fp16* output_data = top_blob.row<__fp16>(ti); - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < num_output; q++) - { - const float* gates_data = gates.row(q); - - float U = gates_data[0]; - float N = gates_data[1]; - - float H = (1 - U) * N + U * hidden_state[q]; - - hidden_state[q] = H; - output_data[q] = (__fp16)H; - } - } - - return 0; -} - -int GRU_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - int T = bottom_blob.h; - - int num_directions = direction == 2 ? 2 : 1; - // initial hidden state - Mat hidden(num_output, 4u, opt.workspace_allocator); - if (hidden.empty()) - return -100; - hidden.fill(0.f); - - top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - // Uni directional - if (direction == 0 || direction == 1) - { - int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt); - if (ret != 0) - return ret; - } - - if (direction == 2) - { - Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator); - if (top_blob_forward.empty()) - return -100; - - Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator); - if (top_blob_reverse.empty()) - return -100; - - int ret0 = gru_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt); - if (ret0 != 0) - return ret0; - - hidden.fill(0.0f); - - int ret1 = gru_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden, opt); - if (ret1 != 0) - return ret1; - - // concat w - for (int i = 0; i < T; i++) - { - const __fp16* pf = top_blob_forward.row(i); - const __fp16* pr = top_blob_reverse.row(i); - __fp16* ptr = top_blob.row<__fp16>(i); - - memcpy(ptr, pf, num_output * sizeof(__fp16)); - memcpy(ptr + num_output, pr, num_output * sizeof(__fp16)); - } - } - - return 0; -} - -int GRU_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const -{ - const Mat& bottom_blob = bottom_blobs[0]; - int T = bottom_blob.h; - int num_directions = direction == 2 ? 2 : 1; - - Mat hidden; - Allocator* hidden_allocator = top_blobs.size() == 2 ? 
opt.blob_allocator : opt.workspace_allocator; - if (bottom_blobs.size() == 2) - { - Option opt_cast = opt; - opt_cast.blob_allocator = hidden_allocator; - cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast); - } - else - { - hidden.create(num_output, num_directions, 4u, hidden_allocator); - if (hidden.empty()) - return -100; - hidden.fill(0.f); - } - - Mat& top_blob = top_blobs[0]; - top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - // Uni directional - if (direction == 0 || direction == 1) - { - Mat hidden0 = hidden.row_range(0, 1); - int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt); - if (ret != 0) - return ret; - } - - if (direction == 2) - { - Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator); - if (top_blob_forward.empty()) - return -100; - - Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator); - if (top_blob_reverse.empty()) - return -100; - - Mat hidden0 = hidden.row_range(0, 1); - int ret0 = gru_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt); - if (ret0 != 0) - return ret0; - - Mat hidden1 = hidden.row_range(1, 1); - int ret1 = gru_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, opt); - if (ret1 != 0) - return ret1; - - // concat w - for (int i = 0; i < T; i++) - { - const __fp16* pf = top_blob_forward.row(i); - const __fp16* pr = top_blob_reverse.row(i); - __fp16* ptr = top_blob.row<__fp16>(i); - - memcpy(ptr, pf, num_output * sizeof(__fp16)); - memcpy(ptr + num_output, pr, num_output * sizeof(__fp16)); - } - } - - if (top_blobs.size() == 2) - { - cast_float32_to_float16(hidden, top_blobs[1], opt); - } - - return 0; -} - -#endif - -//fp16sa start at here -#if __riscv_vector && __riscv_zfh - -int GRU_riscv::create_pipeline_fp16sa(const Option& opt) -{ - cast_float32_to_float16(weight_xc_data, weight_xc_data_fp16sa, opt); - cast_float32_to_float16(weight_hc_data, weight_hc_data_fp16sa, opt); - cast_float32_to_float16(bias_c_data, bias_c_data_fp16sa, opt); - - if (opt.lightmode) - { - weight_xc_data.release(); - bias_c_data.release(); - weight_hc_data.release(); - } - - return 0; -} - -static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt) -{ - int size = bottom_blob.w; - int T = bottom_blob.h; - - int num_output = top_blob.w; - - // 2 x num_output - Mat gates(2, num_output, 4u, opt.workspace_allocator); - if (gates.empty()) - return -100; - - // unroll - for (int t = 0; t < T; t++) - { - int ti = reverse ? 
T - 1 - t : t; - - const __fp16* x = bottom_blob.row(ti); - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < num_output; q++) - { - float* gates_data = gates.row(q); - - // gate reset update - const __fp16* bias_c_R = bias_c.row(0); - const __fp16* bias_c_U = bias_c.row(1); - - const __fp16* weight_xc_R = weight_xc.row(num_output * 0 + q); - const __fp16* weight_xc_U = weight_xc.row(num_output * 1 + q); - const __fp16* weight_hc_R = weight_hc.row(num_output * 0 + q); - const __fp16* weight_hc_U = weight_hc.row(num_output * 1 + q); - - __fp16 R = bias_c_R[q]; - __fp16 U = bias_c_U[q]; - - int n = size; - const __fp16* ptr_x = x; - const __fp16* ptr_xcr = weight_xc_R; - const __fp16* ptr_xcu = weight_xc_U; - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - vfloat16m8_t _x = vle16_v_f16m8(ptr_x, vl); - vfloat16m8_t _xcr = vle16_v_f16m8(ptr_xcr, vl); - vfloat16m8_t _xcu = vle16_v_f16m8(ptr_xcu, vl); - vfloat16m1_t _scalar_r = vfmv_s_f_f16m1(vundefined_f16m1(), R, vl); - vfloat16m1_t _scalar_u = vfmv_s_f_f16m1(vundefined_f16m1(), U, vl); - - _xcr = vfmul_vv_f16m8(_xcr, _x, vl); - _xcu = vfmul_vv_f16m8(_xcu, _x, vl); - _scalar_r = vfredusum_vs_f16m8_f16m1(_scalar_r, _xcr, _scalar_r, vl); - _scalar_u = vfredusum_vs_f16m8_f16m1(_scalar_u, _xcu, _scalar_u, vl); - - R = vfmv_f_s_f16m1_f16(_scalar_r); - U = vfmv_f_s_f16m1_f16(_scalar_u); - - ptr_x += vl; - ptr_xcr += vl; - ptr_xcu += vl; - n -= vl; - } - - int n_out = num_output; - const float* ptr_hc = hidden_state; - const __fp16* ptr_hcr = weight_hc_R; - const __fp16* ptr_hcu = weight_hc_U; - while (n_out > 0) - { - size_t vl = vsetvl_e16m4(n_out); - vfloat16m4_t _h_cont = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_hc, vl), vl); - vfloat16m4_t _hcr = vle16_v_f16m4(ptr_hcr, vl); - vfloat16m4_t _hcu = vle16_v_f16m4(ptr_hcu, vl); - vfloat16m1_t _scalar_r = vfmv_s_f_f16m1(vundefined_f16m1(), R, vl); - vfloat16m1_t _scalar_u = vfmv_s_f_f16m1(vundefined_f16m1(), U, vl); - - _hcr = vfmul_vv_f16m4(_hcr, _h_cont, vl); - _hcu = vfmul_vv_f16m4(_hcu, _h_cont, vl); - _scalar_r = vfredusum_vs_f16m4_f16m1(_scalar_r, _hcr, _scalar_r, vl); - _scalar_u = vfredusum_vs_f16m4_f16m1(_scalar_u, _hcu, _scalar_u, vl); - - R = vfmv_f_s_f16m1_f16(_scalar_r); - U = vfmv_f_s_f16m1_f16(_scalar_u); - - ptr_hc += vl; - ptr_hcr += vl; - ptr_hcu += vl; - n_out -= vl; - } - - // sigmoid(R) - // sigmoid(U) - R = 1.f / (1.f + (__fp16)exp((float)(-R))); - U = 1.f / (1.f + (__fp16)exp((float)(-U))); - - // gate new - const __fp16* bias_c_WN = bias_c.row(2); - const __fp16* bias_c_BN = bias_c.row(3); - - const __fp16* weight_xc_N = weight_xc.row(num_output * 2 + q); - const __fp16* weight_hc_N = weight_hc.row(num_output * 2 + q); - - __fp16 N = bias_c_BN[q]; - - int n_out2 = num_output; - const float* ptr_hc2 = hidden_state; - const __fp16* ptr_whc_n = weight_hc_N; - while (n_out2 > 0) - { - size_t vl = vsetvl_e16m4(n_out2); - - vfloat16m4_t _h_cont = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_hc2, vl), vl); - vfloat16m4_t _whc_n = vle16_v_f16m4(ptr_whc_n, vl); - vfloat16m1_t _scalar_n = vfmv_s_f_f16m1(vundefined_f16m1(), N, vl); - - _h_cont = vfmul_vv_f16m4(_whc_n, _h_cont, vl); - _scalar_n = vfredusum_vs_f16m4_f16m1(_scalar_n, _h_cont, _scalar_n, vl); - - N = vfmv_f_s_f16m1_f16(_scalar_n); - n_out2 -= vl; - ptr_hc2 += vl; - ptr_whc_n += vl; - } - N = bias_c_WN[q] + R * N; - - int n2 = size; - const __fp16* ptr_x2 = x; - const __fp16* ptr_xcn = weight_xc_N; - while (n2 > 0) - { - size_t vl = vsetvl_e16m8(n2); - - vfloat16m8_t _x = vle16_v_f16m8(ptr_x2, vl); - 
vfloat16m8_t _xcn = vle16_v_f16m8(ptr_xcn, vl); - vfloat16m1_t _scalar_n = vfmv_s_f_f16m1(vundefined_f16m1(), N, vl); - - _xcn = vfmul_vv_f16m8(_x, _xcn, vl); - _scalar_n = vfredusum_vs_f16m8_f16m1(_scalar_n, _xcn, _scalar_n, vl); - N = vfmv_f_s_f16m1_f16(_scalar_n); - - n2 -= vl; - ptr_x2 += vl; - ptr_xcn += vl; - } - - // tanh(N) - N = (__fp16)tanh((float)N); - - gates_data[0] = U; - gates_data[1] = N; - } - - // h_t := (1 - update) .* new + update .* h_{t-1} - __fp16* output_data = top_blob.row<__fp16>(ti); - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < num_output; q++) - { - const float* gates_data = gates.row(q); - - float U = gates_data[0]; - float N = gates_data[1]; - - float H = (1 - U) * N + U * hidden_state[q]; - - hidden_state[q] = H; - output_data[q] = H; - } - } - - return 0; -} - -int GRU_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - int T = bottom_blob.h; - - int num_directions = direction == 2 ? 2 : 1; - // initial hidden state - Mat hidden(num_output, 4u, opt.workspace_allocator); - if (hidden.empty()) - return -100; - hidden.fill(0.f); - - top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - // Uni directional - if (direction == 0 || direction == 1) - { - int ret = gru_fp16sa(bottom_blob, top_blob, direction, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt); - if (ret != 0) - return ret; - } - - if (direction == 2) - { - Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator); - if (top_blob_forward.empty()) - return -100; - - Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator); - if (top_blob_reverse.empty()) - return -100; - - int ret0 = gru_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt); - if (ret0 != 0) - return ret0; - - hidden.fill(0.0f); - - int ret1 = gru_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_fp16sa.channel(1), bias_c_data_fp16sa.channel(1), weight_hc_data_fp16sa.channel(1), hidden, opt); - if (ret1 != 0) - return ret1; - - // concat w - for (int i = 0; i < T; i++) - { - const __fp16* pf = top_blob_forward.row(i); - const __fp16* pr = top_blob_reverse.row(i); - __fp16* ptr = top_blob.row<__fp16>(i); - - memcpy(ptr, pf, num_output * sizeof(__fp16)); - memcpy(ptr + num_output, pr, num_output * sizeof(__fp16)); - } - } - - return 0; -} - -int GRU_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const -{ - const Mat& bottom_blob = bottom_blobs[0]; - int T = bottom_blob.h; - int num_directions = direction == 2 ? 2 : 1; - - Mat hidden; - Allocator* hidden_allocator = top_blobs.size() == 2 ? 
opt.blob_allocator : opt.workspace_allocator; - if (bottom_blobs.size() == 2) - { - Option opt_cast = opt; - opt_cast.blob_allocator = hidden_allocator; - cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast); - } - else - { - hidden.create(num_output, num_directions, 4u, hidden_allocator); - if (hidden.empty()) - return -100; - hidden.fill(0.f); - } - - Mat& top_blob = top_blobs[0]; - top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - // Uni directional - if (direction == 0 || direction == 1) - { - int ret = gru_fp16sa(bottom_blob, top_blob, direction, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt); - if (ret != 0) - return ret; - } - - if (direction == 2) - { - Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator); - if (top_blob_forward.empty()) - return -100; - - Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator); - if (top_blob_reverse.empty()) - return -100; - - Mat hidden0 = hidden.row_range(0, 1); - int ret0 = gru_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden0, opt); - if (ret0 != 0) - return ret0; - - Mat hidden1 = hidden.row_range(1, 1); - int ret1 = gru_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_fp16sa.channel(1), bias_c_data_fp16sa.channel(1), weight_hc_data_fp16sa.channel(1), hidden1, opt); - if (ret1 != 0) - return ret1; - - // concat w - for (int i = 0; i < T; i++) - { - const __fp16* pf = top_blob_forward.row(i); - const __fp16* pr = top_blob_reverse.row(i); - __fp16* ptr = top_blob.row<__fp16>(i); - - memcpy(ptr, pf, num_output * sizeof(__fp16)); - memcpy(ptr + num_output, pr, num_output * sizeof(__fp16)); - } - } - - if (top_blobs.size() == 2) - { - cast_float32_to_float16(hidden, top_blobs[1], opt); - } - - return 0; -} - -#endif - } // namespace ncnn diff --git a/src/layer/riscv/gru_riscv.h b/src/layer/riscv/gru_riscv.h index 46bb624519f..a9434f83083 100644 --- a/src/layer/riscv/gru_riscv.h +++ b/src/layer/riscv/gru_riscv.h @@ -29,7 +29,7 @@ class GRU_riscv : public GRU virtual int create_pipeline(const Option& opt); protected: -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; diff --git a/src/layer/riscv/gru_riscv_zfh.cpp b/src/layer/riscv/gru_riscv_zfh.cpp new file mode 100644 index 00000000000..7ffe9723629 --- /dev/null +++ b/src/layer/riscv/gru_riscv_zfh.cpp @@ -0,0 +1,738 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
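+
+// The fp16 storage (fp16s) and fp16 arithmetic (fp16sa) GRU kernels below compute every
+// gate value as a dot product, strip-mined over the RVV vector length: set vl for the
+// remaining elements, load, multiply element-wise, and fold the partial products into a
+// running scalar with an unordered reduction. A minimal sketch of that pattern (not the
+// exact kernel, which additionally widens the __fp16 inputs and interleaves the R/U gates),
+// assuming only <riscv_vector.h>:
+//
+//   static float dot_product_rvv(const float* a, const float* b, int n)
+//   {
+//       float sum = 0.f;
+//       while (n > 0)
+//       {
+//           size_t vl = __riscv_vsetvl_e32m8(n);
+//           vfloat32m8_t _a = __riscv_vle32_v_f32m8(a, vl);
+//           vfloat32m8_t _b = __riscv_vle32_v_f32m8(b, vl);
+//           vfloat32m1_t _s = __riscv_vfmv_s_f_f32m1(sum, vl);
+//           _s = __riscv_vfredusum_vs_f32m8_f32m1(__riscv_vfmul_vv_f32m8(_a, _b, vl), _s, vl);
+//           sum = __riscv_vfmv_f_s_f32m1_f32(_s);
+//           a += vl;
+//           b += vl;
+//           n -= vl;
+//       }
+//       return sum;
+//   }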
+ +#include "gru_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +namespace ncnn { + +#if NCNN_ZFH +static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt) +{ + int size = bottom_blob.w; + int T = bottom_blob.h; + + int num_output = top_blob.w; + + // 2 x num_output + Mat gates(2, num_output, 4u, opt.workspace_allocator); + if (gates.empty()) + return -100; + + // unroll + for (int t = 0; t < T; t++) + { + int ti = reverse ? T - 1 - t : t; + + const __fp16* x = bottom_blob.row(ti); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_output; q++) + { + float* gates_data = gates.row(q); + + // gate reset update + const float* bias_c_R = bias_c.row(0); + const float* bias_c_U = bias_c.row(1); + + const float* weight_xc_R = weight_xc.row(num_output * 0 + q); + const float* weight_xc_U = weight_xc.row(num_output * 1 + q); + const float* weight_hc_R = weight_hc.row(num_output * 0 + q); + const float* weight_hc_U = weight_hc.row(num_output * 1 + q); + + float R = bias_c_R[q]; + float U = bias_c_U[q]; + +#if __riscv_zvfh + const __fp16* ptr_x = x; + const float* ptr_xcr = weight_xc_R; + const float* ptr_xcu = weight_xc_U; + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + vfloat32m8_t _x = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr_x, vl), vl); + vfloat32m8_t _xcr = __riscv_vle32_v_f32m8(ptr_xcr, vl); + vfloat32m8_t _xcu = __riscv_vle32_v_f32m8(ptr_xcu, vl); + vfloat32m1_t _scalar_r = __riscv_vfmv_s_f_f32m1(R, vl); + vfloat32m1_t _scalar_u = __riscv_vfmv_s_f_f32m1(U, vl); + + _xcr = __riscv_vfmul_vv_f32m8(_xcr, _x, vl); + _xcu = __riscv_vfmul_vv_f32m8(_xcu, _x, vl); + _scalar_r = __riscv_vfredusum_vs_f32m8_f32m1(_xcr, _scalar_r, vl); + _scalar_u = __riscv_vfredusum_vs_f32m8_f32m1(_xcu, _scalar_u, vl); + + R = __riscv_vfmv_f_s_f32m1_f32(_scalar_r); + U = __riscv_vfmv_f_s_f32m1_f32(_scalar_u); + + ptr_x += vl; + ptr_xcr += vl; + ptr_xcu += vl; + n -= vl; + } + ptr_x = NULL; + ptr_xcr = NULL; + ptr_xcu = NULL; +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + float xi = x[i]; + + R += weight_xc_R[i] * xi; + U += weight_xc_U[i] * xi; + } +#endif // __riscv_zvfh + +#if __riscv_zvfh + const float* ptr_hc = hidden_state; + const float* ptr_hcr = weight_hc_R; + const float* ptr_hcu = weight_hc_U; + int n_out = num_output; + while (n_out > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n_out); + vfloat32m8_t _h_cont = __riscv_vle32_v_f32m8(ptr_hc, vl); + vfloat32m8_t _hcr = __riscv_vle32_v_f32m8(ptr_hcr, vl); + vfloat32m8_t _hcu = __riscv_vle32_v_f32m8(ptr_hcu, vl); + vfloat32m1_t _scalar_r = __riscv_vfmv_s_f_f32m1(R, vl); + vfloat32m1_t _scalar_u = __riscv_vfmv_s_f_f32m1(U, vl); + + _hcr = __riscv_vfmul_vv_f32m8(_hcr, _h_cont, vl); + _hcu = __riscv_vfmul_vv_f32m8(_hcu, _h_cont, vl); + _scalar_r = __riscv_vfredusum_vs_f32m8_f32m1(_hcr, _scalar_r, vl); + _scalar_u = __riscv_vfredusum_vs_f32m8_f32m1(_hcu, _scalar_u, vl); + + R = __riscv_vfmv_f_s_f32m1_f32(_scalar_r); + U = __riscv_vfmv_f_s_f32m1_f32(_scalar_u); + + ptr_hc += vl; + ptr_hcr += vl; + ptr_hcu += vl; + n_out -= vl; + } + ptr_hc = NULL; + ptr_hcr = NULL; + ptr_hcu = NULL; +#else // __riscv_zvfh + for (int i = 0; i < num_output; i++) + { + float h_cont = hidden_state[i]; + + R += weight_hc_R[i] * h_cont; + U += weight_hc_U[i] * h_cont; + } +#endif // __riscv_zvfh + + // sigmoid(R) + // sigmoid(U) + R = 1.f / (1.f + exp(-R)); + U = 1.f / 
(1.f + exp(-U)); + + // gate new + const float* bias_c_WN = bias_c.row(2); + const float* bias_c_BN = bias_c.row(3); + + const float* weight_xc_N = weight_xc.row(num_output * 2 + q); + const float* weight_hc_N = weight_hc.row(num_output * 2 + q); + + float N = bias_c_BN[q]; + +#if __riscv_zvfh + const float* ptr_hc2 = hidden_state; + const float* ptr_whc_n = weight_hc_N; + int n_out2 = num_output; + while (n_out2 > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n_out2); + + vfloat32m8_t _h_cont = __riscv_vle32_v_f32m8(ptr_hc2, vl); + vfloat32m8_t _whc_n = __riscv_vle32_v_f32m8(ptr_whc_n, vl); + vfloat32m1_t _scalar_n = __riscv_vfmv_s_f_f32m1(N, vl); + + _h_cont = __riscv_vfmul_vv_f32m8(_whc_n, _h_cont, vl); + _scalar_n = __riscv_vfredusum_vs_f32m8_f32m1(_h_cont, _scalar_n, vl); + + N = __riscv_vfmv_f_s_f32m1_f32(_scalar_n); + n_out2 -= vl; + ptr_hc2 += vl; + ptr_whc_n += vl; + } + ptr_hc2 = NULL; + ptr_whc_n = NULL; +#else // __riscv_zvfh + for (int i = 0; i < num_output; i++) + { + float h_cont = hidden_state[i]; + + N += weight_hc_N[i] * h_cont; + } +#endif // __riscv_zvfh + + N = bias_c_WN[q] + R * N; + +#if __riscv_zvfh + const __fp16* ptr_x2 = x; + const float* ptr_xcn = weight_xc_N; + int n2 = size; + while (n2 > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n2); + + vfloat32m8_t _x = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr_x2, vl), vl); + vfloat32m8_t _xcn = __riscv_vle32_v_f32m8(ptr_xcn, vl); + vfloat32m1_t _scalar_n = __riscv_vfmv_s_f_f32m1(N, vl); + + _xcn = __riscv_vfmul_vv_f32m8(_x, _xcn, vl); + _scalar_n = __riscv_vfredusum_vs_f32m8_f32m1(_xcn, _scalar_n, vl); + N = __riscv_vfmv_f_s_f32m1_f32(_scalar_n); + + n2 -= vl; + ptr_x2 += vl; + ptr_xcn += vl; + } + ptr_x2 = NULL; + ptr_xcn = NULL; +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + float xi = x[i]; + + N += weight_xc_N[i] * xi; + } +#endif // __riscv_zvfh + + // tanh(N) + N = tanh(N); + + gates_data[0] = U; + gates_data[1] = N; + } + + // h_t := (1 - update) .* new + update .* h_{t-1} + __fp16* output_data = top_blob.row<__fp16>(ti); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_output; q++) + { + const float* gates_data = gates.row(q); + + float U = gates_data[0]; + float N = gates_data[1]; + + float H = (1 - U) * N + U * hidden_state[q]; + + hidden_state[q] = H; + output_data[q] = (__fp16)H; + } + } + + return 0; +} + +int GRU_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int T = bottom_blob.h; + + int num_directions = direction == 2 ? 
2 : 1; + // initial hidden state + Mat hidden(num_output, 4u, opt.workspace_allocator); + if (hidden.empty()) + return -100; + hidden.fill(0.f); + + top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // Uni directional + if (direction == 0 || direction == 1) + { + int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt); + if (ret != 0) + return ret; + } + + if (direction == 2) + { + Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator); + if (top_blob_forward.empty()) + return -100; + + Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator); + if (top_blob_reverse.empty()) + return -100; + + int ret0 = gru_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt); + if (ret0 != 0) + return ret0; + + hidden.fill(0.0f); + + int ret1 = gru_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden, opt); + if (ret1 != 0) + return ret1; + + // concat w + for (int i = 0; i < T; i++) + { + const __fp16* pf = top_blob_forward.row(i); + const __fp16* pr = top_blob_reverse.row(i); + __fp16* ptr = top_blob.row<__fp16>(i); + + memcpy(ptr, pf, num_output * sizeof(__fp16)); + memcpy(ptr + num_output, pr, num_output * sizeof(__fp16)); + } + } + + return 0; +} + +int GRU_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + int T = bottom_blob.h; + int num_directions = direction == 2 ? 2 : 1; + + Mat hidden; + Allocator* hidden_allocator = top_blobs.size() == 2 ? 
opt.blob_allocator : opt.workspace_allocator; + if (bottom_blobs.size() == 2) + { + Option opt_cast = opt; + opt_cast.blob_allocator = hidden_allocator; + cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast); + } + else + { + hidden.create(num_output, num_directions, 4u, hidden_allocator); + if (hidden.empty()) + return -100; + hidden.fill(0.f); + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // Uni directional + if (direction == 0 || direction == 1) + { + Mat hidden0 = hidden.row_range(0, 1); + int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt); + if (ret != 0) + return ret; + } + + if (direction == 2) + { + Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator); + if (top_blob_forward.empty()) + return -100; + + Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator); + if (top_blob_reverse.empty()) + return -100; + + Mat hidden0 = hidden.row_range(0, 1); + int ret0 = gru_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt); + if (ret0 != 0) + return ret0; + + Mat hidden1 = hidden.row_range(1, 1); + int ret1 = gru_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, opt); + if (ret1 != 0) + return ret1; + + // concat w + for (int i = 0; i < T; i++) + { + const __fp16* pf = top_blob_forward.row(i); + const __fp16* pr = top_blob_reverse.row(i); + __fp16* ptr = top_blob.row<__fp16>(i); + + memcpy(ptr, pf, num_output * sizeof(__fp16)); + memcpy(ptr + num_output, pr, num_output * sizeof(__fp16)); + } + } + + if (top_blobs.size() == 2) + { + cast_float32_to_float16(hidden, top_blobs[1], opt); + } + + return 0; +} + +int GRU_riscv::create_pipeline_fp16sa(const Option& opt) +{ + cast_float32_to_float16(weight_xc_data, weight_xc_data_fp16sa, opt); + cast_float32_to_float16(weight_hc_data, weight_hc_data_fp16sa, opt); + cast_float32_to_float16(bias_c_data, bias_c_data_fp16sa, opt); + + if (opt.lightmode) + { + weight_xc_data.release(); + bias_c_data.release(); + weight_hc_data.release(); + } + + return 0; +} + +static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt) +{ + int size = bottom_blob.w; + int T = bottom_blob.h; + + int num_output = top_blob.w; + + // 2 x num_output + Mat gates(2, num_output, 4u, opt.workspace_allocator); + if (gates.empty()) + return -100; + + // unroll + for (int t = 0; t < T; t++) + { + int ti = reverse ? 
T - 1 - t : t; + + const __fp16* x = bottom_blob.row(ti); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_output; q++) + { + float* gates_data = gates.row(q); + + // gate reset update + const __fp16* bias_c_R = bias_c.row(0); + const __fp16* bias_c_U = bias_c.row(1); + + const __fp16* weight_xc_R = weight_xc.row(num_output * 0 + q); + const __fp16* weight_xc_U = weight_xc.row(num_output * 1 + q); + const __fp16* weight_hc_R = weight_hc.row(num_output * 0 + q); + const __fp16* weight_hc_U = weight_hc.row(num_output * 1 + q); + + __fp16 R = bias_c_R[q]; + __fp16 U = bias_c_U[q]; + +#if __riscv_zvfh + const __fp16* ptr_x = x; + const __fp16* ptr_xcr = weight_xc_R; + const __fp16* ptr_xcu = weight_xc_U; + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + vfloat16m8_t _x = __riscv_vle16_v_f16m8(ptr_x, vl); + vfloat16m8_t _xcr = __riscv_vle16_v_f16m8(ptr_xcr, vl); + vfloat16m8_t _xcu = __riscv_vle16_v_f16m8(ptr_xcu, vl); + vfloat16m1_t _scalar_r = __riscv_vfmv_s_f_f16m1(R, vl); + vfloat16m1_t _scalar_u = __riscv_vfmv_s_f_f16m1(U, vl); + + _xcr = __riscv_vfmul_vv_f16m8(_xcr, _x, vl); + _xcu = __riscv_vfmul_vv_f16m8(_xcu, _x, vl); + _scalar_r = __riscv_vfredusum_vs_f16m8_f16m1(_xcr, _scalar_r, vl); + _scalar_u = __riscv_vfredusum_vs_f16m8_f16m1(_xcu, _scalar_u, vl); + + R = __riscv_vfmv_f_s_f16m1_f16(_scalar_r); + U = __riscv_vfmv_f_s_f16m1_f16(_scalar_u); + + ptr_x += vl; + ptr_xcr += vl; + ptr_xcu += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + float xi = x[i]; + + R += weight_xc_R[i] * xi; + U += weight_xc_U[i] * xi; + } +#endif // __riscv_zvfh + +#if __riscv_zvfh + const float* ptr_hc = hidden_state; + const __fp16* ptr_hcr = weight_hc_R; + const __fp16* ptr_hcu = weight_hc_U; + int n_out = num_output; + while (n_out > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n_out); + vfloat16m4_t _h_cont = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(ptr_hc, vl), vl); + vfloat16m4_t _hcr = __riscv_vle16_v_f16m4(ptr_hcr, vl); + vfloat16m4_t _hcu = __riscv_vle16_v_f16m4(ptr_hcu, vl); + vfloat16m1_t _scalar_r = __riscv_vfmv_s_f_f16m1(R, vl); + vfloat16m1_t _scalar_u = __riscv_vfmv_s_f_f16m1(U, vl); + + _hcr = __riscv_vfmul_vv_f16m4(_hcr, _h_cont, vl); + _hcu = __riscv_vfmul_vv_f16m4(_hcu, _h_cont, vl); + _scalar_r = __riscv_vfredusum_vs_f16m4_f16m1(_hcr, _scalar_r, vl); + _scalar_u = __riscv_vfredusum_vs_f16m4_f16m1(_hcu, _scalar_u, vl); + + R = __riscv_vfmv_f_s_f16m1_f16(_scalar_r); + U = __riscv_vfmv_f_s_f16m1_f16(_scalar_u); + + ptr_hc += vl; + ptr_hcr += vl; + ptr_hcu += vl; + n_out -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < num_output; i++) + { + float h_cont = hidden_state[i]; + + R += weight_hc_R[i] * h_cont; + U += weight_hc_U[i] * h_cont; + } +#endif // __riscv_zvfh + + // sigmoid(R) + // sigmoid(U) + R = 1.f / (1.f + (__fp16)exp((float)(-R))); + U = 1.f / (1.f + (__fp16)exp((float)(-U))); + + // gate new + const __fp16* bias_c_WN = bias_c.row(2); + const __fp16* bias_c_BN = bias_c.row(3); + + const __fp16* weight_xc_N = weight_xc.row(num_output * 2 + q); + const __fp16* weight_hc_N = weight_hc.row(num_output * 2 + q); + + __fp16 N = bias_c_BN[q]; + +#if __riscv_zvfh + const float* ptr_hc2 = hidden_state; + const __fp16* ptr_whc_n = weight_hc_N; + int n_out2 = num_output; + while (n_out2 > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n_out2); + + vfloat16m4_t _h_cont = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(ptr_hc2, vl), vl); + vfloat16m4_t _whc_n = __riscv_vle16_v_f16m4(ptr_whc_n, 
vl); + vfloat16m1_t _scalar_n = __riscv_vfmv_s_f_f16m1(N, vl); + + _h_cont = __riscv_vfmul_vv_f16m4(_whc_n, _h_cont, vl); + _scalar_n = __riscv_vfredusum_vs_f16m4_f16m1(_h_cont, _scalar_n, vl); + + N = __riscv_vfmv_f_s_f16m1_f16(_scalar_n); + n_out2 -= vl; + ptr_hc2 += vl; + ptr_whc_n += vl; + } +#else // __riscv_zvfh + for (int i = 0; i < num_output; i++) + { + float h_cont = hidden_state[i]; + + N += weight_hc_N[i] * h_cont; + } +#endif // __riscv_zvfh + + N = bias_c_WN[q] + R * N; + +#if __riscv_zvfh + const __fp16* ptr_x2 = x; + const __fp16* ptr_xcn = weight_xc_N; + int n2 = size; + while (n2 > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n2); + + vfloat16m8_t _x = __riscv_vle16_v_f16m8(ptr_x2, vl); + vfloat16m8_t _xcn = __riscv_vle16_v_f16m8(ptr_xcn, vl); + vfloat16m1_t _scalar_n = __riscv_vfmv_s_f_f16m1(N, vl); + + _xcn = __riscv_vfmul_vv_f16m8(_x, _xcn, vl); + _scalar_n = __riscv_vfredusum_vs_f16m8_f16m1(_xcn, _scalar_n, vl); + N = __riscv_vfmv_f_s_f16m1_f16(_scalar_n); + + n2 -= vl; + ptr_x2 += vl; + ptr_xcn += vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + float xi = x[i]; + + N += weight_xc_N[i] * xi; + } +#endif // __riscv_zvfh + + // tanh(N) + N = (__fp16)tanh((float)N); + + gates_data[0] = U; + gates_data[1] = N; + } + + // h_t := (1 - update) .* new + update .* h_{t-1} + __fp16* output_data = top_blob.row<__fp16>(ti); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_output; q++) + { + const float* gates_data = gates.row(q); + + float U = gates_data[0]; + float N = gates_data[1]; + + float H = (1 - U) * N + U * hidden_state[q]; + + hidden_state[q] = H; + output_data[q] = H; + } + } + + return 0; +} + +int GRU_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int T = bottom_blob.h; + + int num_directions = direction == 2 ? 
2 : 1; + // initial hidden state + Mat hidden(num_output, 4u, opt.workspace_allocator); + if (hidden.empty()) + return -100; + hidden.fill(0.f); + + top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // Uni directional + if (direction == 0 || direction == 1) + { + int ret = gru_fp16sa(bottom_blob, top_blob, direction, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt); + if (ret != 0) + return ret; + } + + if (direction == 2) + { + Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator); + if (top_blob_forward.empty()) + return -100; + + Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator); + if (top_blob_reverse.empty()) + return -100; + + int ret0 = gru_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt); + if (ret0 != 0) + return ret0; + + hidden.fill(0.0f); + + int ret1 = gru_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_fp16sa.channel(1), bias_c_data_fp16sa.channel(1), weight_hc_data_fp16sa.channel(1), hidden, opt); + if (ret1 != 0) + return ret1; + + // concat w + for (int i = 0; i < T; i++) + { + const __fp16* pf = top_blob_forward.row(i); + const __fp16* pr = top_blob_reverse.row(i); + __fp16* ptr = top_blob.row<__fp16>(i); + + memcpy(ptr, pf, num_output * sizeof(__fp16)); + memcpy(ptr + num_output, pr, num_output * sizeof(__fp16)); + } + } + + return 0; +} + +int GRU_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + int T = bottom_blob.h; + int num_directions = direction == 2 ? 2 : 1; + + Mat hidden; + Allocator* hidden_allocator = top_blobs.size() == 2 ? 
opt.blob_allocator : opt.workspace_allocator; + if (bottom_blobs.size() == 2) + { + Option opt_cast = opt; + opt_cast.blob_allocator = hidden_allocator; + cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast); + } + else + { + hidden.create(num_output, num_directions, 4u, hidden_allocator); + if (hidden.empty()) + return -100; + hidden.fill(0.f); + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // Uni directional + if (direction == 0 || direction == 1) + { + int ret = gru_fp16sa(bottom_blob, top_blob, direction, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt); + if (ret != 0) + return ret; + } + + if (direction == 2) + { + Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator); + if (top_blob_forward.empty()) + return -100; + + Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator); + if (top_blob_reverse.empty()) + return -100; + + Mat hidden0 = hidden.row_range(0, 1); + int ret0 = gru_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden0, opt); + if (ret0 != 0) + return ret0; + + Mat hidden1 = hidden.row_range(1, 1); + int ret1 = gru_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_fp16sa.channel(1), bias_c_data_fp16sa.channel(1), weight_hc_data_fp16sa.channel(1), hidden1, opt); + if (ret1 != 0) + return ret1; + + // concat w + for (int i = 0; i < T; i++) + { + const __fp16* pf = top_blob_forward.row(i); + const __fp16* pr = top_blob_reverse.row(i); + __fp16* ptr = top_blob.row<__fp16>(i); + + memcpy(ptr, pf, num_output * sizeof(__fp16)); + memcpy(ptr + num_output, pr, num_output * sizeof(__fp16)); + } + } + + if (top_blobs.size() == 2) + { + cast_float32_to_float16(hidden, top_blobs[1], opt); + } + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/hardsigmoid_riscv.cpp b/src/layer/riscv/hardsigmoid_riscv.cpp index 112a1c9c8d2..370f623e9d8 100644 --- a/src/layer/riscv/hardsigmoid_riscv.cpp +++ b/src/layer/riscv/hardsigmoid_riscv.cpp @@ -21,21 +21,27 @@ #include #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { HardSigmoid_riscv::HardSigmoid_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif } int HardSigmoid_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) @@ -60,20 +66,18 @@ int HardSigmoid_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); - vbool4_t _lower = vmflt_vf_f32m8_b4(_p, lower, vl); - vbool4_t _higher = vmfgt_vf_f32m8_b4(_p, upper, vl); - vbool4_t _apply = vmnor_mm_b4(_lower, _higher, vl); - _p = vfmerge_vfm_f32m8(_lower, _p, .0f, vl); - _p = vfmerge_vfm_f32m8(_higher, _p, 1.f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, lower, vl); + vbool4_t _higher = __riscv_vmfgt_vf_f32m8_b4(_p, upper, vl); + vbool4_t _apply = 
__riscv_vmnor_mm_b4(_lower, _higher, vl); + _p = __riscv_vfmerge_vfm_f32m8(_p, .0f, _lower, vl); + _p = __riscv_vfmerge_vfm_f32m8(_p, 1.f, _higher, vl); - _p = vfadd_vf_f32m8_m(_apply, _p, - /*op1*/ vfmul_vf_f32m8_m(_apply, _p, _p, alpha, vl), - beta, vl); + _p = __riscv_vfadd_vf_f32m8_mu(_apply, _p, __riscv_vfmul_vf_f32m8_m(_apply, _p, alpha, vl), beta, vl); - vse32_v_f32m8(ptr, _p, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; n -= vl; } @@ -93,45 +97,4 @@ int HardSigmoid_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) return 0; } -#if __riscv_vector && __riscv_zfh -int HardSigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const -{ - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int d = bottom_top_blob.d; - int channels = bottom_top_blob.c; - int elempack = bottom_top_blob.elempack; - int size = w * h * d * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - - vbool2_t _lower = vmflt_vf_f16m8_b2(_p, lower, vl); - vbool2_t _higher = vmfgt_vf_f16m8_b2(_p, upper, vl); - vbool2_t _apply = vmnor_mm_b2(_lower, _higher, vl); - _p = vfmerge_vfm_f16m8(_lower, _p, .0f, vl); - _p = vfmerge_vfm_f16m8(_higher, _p, 1.f, vl); - - _p = vfadd_vf_f16m8_m( - _apply, _p, - /*op1*/ vfmul_vf_f16m8_m(_apply, _p, /*op1*/ _p, alpha, vl), beta, - vl); - vse16_v_f16m8(ptr, _p, vl); - ptr += vl; - n -= vl; - } - } - - return 0; -} -#endif - } // namespace ncnn diff --git a/src/layer/riscv/hardsigmoid_riscv.h b/src/layer/riscv/hardsigmoid_riscv.h index 3c264b3188e..0883422101c 100644 --- a/src/layer/riscv/hardsigmoid_riscv.h +++ b/src/layer/riscv/hardsigmoid_riscv.h @@ -27,7 +27,7 @@ class HardSigmoid_riscv : public HardSigmoid virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif }; diff --git a/src/layer/riscv/hardsigmoid_riscv_zfh.cpp b/src/layer/riscv/hardsigmoid_riscv_zfh.cpp new file mode 100644 index 00000000000..f1f8ab6f618 --- /dev/null +++ b/src/layer/riscv/hardsigmoid_riscv_zfh.cpp @@ -0,0 +1,78 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
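+
+// HardSigmoid clamps a linear ramp: y = 0 for x < lower, y = 1 for x > upper, and
+// y = x * alpha + beta in between. The vector path below keeps this branchless by building
+// "below lower" / "above upper" mask registers, merging the saturated values with vfmerge,
+// and applying the affine part only to the remaining lanes. Restated as a standalone scalar
+// helper (the same logic as the non-zvfh fallback below), this is essentially:
+//
+//   static __fp16 hardsigmoid_scalar(__fp16 x, __fp16 alpha, __fp16 beta, __fp16 lower, __fp16 upper)
+//   {
+//       if (x < lower) return (__fp16)0.f;
+//       if (x > upper) return (__fp16)1.f;
+//       return x * alpha + beta;
+//   }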
+ +#include "hardsigmoid_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +namespace ncnn { + +#if NCNN_ZFH +int HardSigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + + __fp16 _lower = (__fp16)lower; + __fp16 _upper = (__fp16)upper; + __fp16 _alpha = (__fp16)alpha; + __fp16 _beta = (__fp16)beta; + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + + vbool2_t _is_lower = __riscv_vmflt_vf_f16m8_b2(_p, _lower, vl); + vbool2_t _is_higher = __riscv_vmfgt_vf_f16m8_b2(_p, _upper, vl); + vbool2_t _apply = __riscv_vmnor_mm_b2(_is_lower, _is_higher, vl); + _p = __riscv_vfmerge_vfm_f16m8(_p, (__fp16)0.f, _is_lower, vl); + _p = __riscv_vfmerge_vfm_f16m8(_p, (__fp16)1.f, _is_higher, vl); + + _p = __riscv_vfadd_vf_f16m8_mu(_apply, _p, __riscv_vfmul_vf_f16m8_m(_apply, _p, _alpha, vl), _beta, vl); + __riscv_vse16_v_f16m8(ptr, _p, vl); + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + if (ptr[i] < _lower) + ptr[i] = (__fp16)0.f; + else if (ptr[i] > _upper) + ptr[i] = (__fp16)1.f; + else + ptr[i] = ptr[i] * _alpha + _beta; + } +#endif // __riscv_zvfh + } + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/hardswish_riscv.cpp b/src/layer/riscv/hardswish_riscv.cpp index 5d68e07b06a..5de9b943a60 100644 --- a/src/layer/riscv/hardswish_riscv.cpp +++ b/src/layer/riscv/hardswish_riscv.cpp @@ -21,21 +21,27 @@ #include #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { HardSwish_riscv::HardSwish_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif } int HardSwish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) @@ -60,20 +66,18 @@ int HardSwish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co int n = size; while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); - vbool4_t _lower = vmflt_vf_f32m8_b4(_p, lower, vl); - vbool4_t _higher = vmfgt_vf_f32m8_b4(_p, upper, vl); - vbool4_t _apply = vmnor_mm_b4(_lower, _higher, vl); - _p = vfmerge_vfm_f32m8(_lower, _p, .0f, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, lower, vl); + vbool4_t _higher = __riscv_vmfgt_vf_f32m8_b4(_p, upper, vl); + vbool4_t _apply = __riscv_vmnor_mm_b4(_lower, _higher, vl); + _p = __riscv_vfmerge_vfm_f32m8(_p, .0f, _lower, vl); - vfloat32m8_t _p0 = vfadd_vf_f32m8_m( - _apply, _p, /*op1*/ vfmul_vf_f32m8_m(_apply, _p, _p, alpha, vl), beta, - vl); - _p = vfmul_vv_f32m8_m(_apply, _p, /*op1*/ _p, _p0, vl); + vfloat32m8_t _p0 = __riscv_vfadd_vf_f32m8_m(_apply, __riscv_vfmul_vf_f32m8_m(_apply, _p, alpha, vl), beta, vl); + _p = 
__riscv_vfmul_vv_f32m8_mu(_apply, _p, _p, _p0, vl); - vse32_v_f32m8(ptr, _p, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; n -= vl; } @@ -93,45 +97,4 @@ int HardSwish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co return 0; } -#if __riscv_vector && __riscv_zfh -int HardSwish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const -{ - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int d = bottom_top_blob.d; - int channels = bottom_top_blob.c; - int elempack = bottom_top_blob.elempack; - int size = w * h * d * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - - vbool2_t _lower = vmflt_vf_f16m8_b2(_p, lower, vl); - vbool2_t _higher = vmfgt_vf_f16m8_b2(_p, upper, vl); - vbool2_t _apply = vmnor_mm_b2(_lower, _higher, vl); - _p = vfmerge_vfm_f16m8(_lower, _p, .0f, vl); - - vfloat16m8_t _p0 = vfadd_vf_f16m8_m( - _apply, _p, /*op1*/ vfmul_vf_f16m8_m(_apply, _p, _p, alpha, vl), beta, - vl); - _p = vfmul_vv_f16m8_m(_apply, _p, /*op1*/ _p, _p0, vl); - - vse16_v_f16m8(ptr, _p, vl); - ptr += vl; - n -= vl; - } - } - - return 0; -} -#endif - } // namespace ncnn diff --git a/src/layer/riscv/hardswish_riscv.h b/src/layer/riscv/hardswish_riscv.h index cfec7916f59..b882487ba21 100644 --- a/src/layer/riscv/hardswish_riscv.h +++ b/src/layer/riscv/hardswish_riscv.h @@ -30,7 +30,7 @@ class HardSwish_riscv : public HardSwish virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif }; diff --git a/src/layer/riscv/hardswish_riscv_zfh.cpp b/src/layer/riscv/hardswish_riscv_zfh.cpp new file mode 100644 index 00000000000..2afdf07d9dd --- /dev/null +++ b/src/layer/riscv/hardswish_riscv_zfh.cpp @@ -0,0 +1,79 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
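+
+// HardSwish multiplies the input by a clamped linear gate: y = 0 for x < lower, y = x for
+// x > upper, and y = x * (x * alpha + beta) in between. The vector path below mirrors the
+// fp32 kernel: mask out the saturated lanes, compute the gate x * alpha + beta on the rest,
+// and multiply it back into x with a masked vfmul. Restated as a scalar helper (the same
+// logic as the non-zvfh fallback below), this is roughly:
+//
+//   static __fp16 hardswish_scalar(__fp16 x, __fp16 alpha, __fp16 beta, __fp16 lower, __fp16 upper)
+//   {
+//       if (x < lower) return (__fp16)0.f;
+//       if (x > upper) return x;
+//       return x * (x * alpha + beta);
+//   }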
+ +#include "hardswish_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +namespace ncnn { + +#if NCNN_ZFH +int HardSwish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + + __fp16 _lower = (__fp16)lower; + __fp16 _upper = (__fp16)upper; + __fp16 _alpha = (__fp16)alpha; + __fp16 _beta = (__fp16)beta; + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + + vbool2_t _is_lower = __riscv_vmflt_vf_f16m8_b2(_p, _lower, vl); + vbool2_t _is_higher = __riscv_vmfgt_vf_f16m8_b2(_p, _upper, vl); + vbool2_t _apply = __riscv_vmnor_mm_b2(_is_lower, _is_higher, vl); + _p = __riscv_vfmerge_vfm_f16m8(_p, (__fp16).0f, _is_lower, vl); + + vfloat16m8_t _p0 = __riscv_vfadd_vf_f16m8_m(_apply, __riscv_vfmul_vf_f16m8_m(_apply, _p, _alpha, vl), _beta, vl); + _p = __riscv_vfmul_vv_f16m8_mu(_apply, _p, _p, _p0, vl); + + __riscv_vse16_v_f16m8(ptr, _p, vl); + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + if (ptr[i] < _lower) + ptr[i] = (__fp16)0.f; + else if (ptr[i] > _upper) + ; + else + ptr[i] = ptr[i] * (ptr[i] * _alpha + _beta); + } +#endif // __riscv_zvfh + } + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index e71c09a157c..8b113b3ea57 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -19,20 +19,25 @@ #if __riscv_vector #include #endif // __riscv_vector - #include "riscv_activation.h" #include "riscv_usability.h" +#include "cpu.h" + namespace ncnn { InnerProduct_riscv::InnerProduct_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif flatten = 0; } @@ -57,8 +62,8 @@ int InnerProduct_riscv::create_pipeline(const Option& opt) } #endif -#if __riscv_vector && __riscv_zfh - if (opt.use_fp16_storage) +#if NCNN_ZFH + if (support_fp16_storage && opt.use_fp16_storage) { return create_pipeline_fp16s(opt); } @@ -153,9 +158,9 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt } #endif +#if NCNN_ZFH int elembits = bottom_blob.elembits(); -#if __riscv_vector && __riscv_zfh if (opt.use_fp16_storage && elembits == 16) { if (opt.use_fp16_arithmetic) @@ -196,7 +201,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt #if __riscv_vector if (elempack == packn && num_output_elempack == packn) { - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); float* outptr = top_blob.row(j); @@ -207,18 +212,18 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt const float* kptr = weight_data_tm.row(p) + l; const float* m = bottom_blob.row(j); - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias_term) { - _sum = vfmv_v_f_f32m1(bias_data[p * packn + 
l], vl); + _sum = __riscv_vfmv_v_f_f32m1(bias_data[p * packn + l], vl); } int n = num_input; while (n > 0) { - vfloat32m1_t _val = vle32_v_f32m1(m, vl); - _sum = vfmacc_vf_f32m1(_sum, *kptr, _val, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(m, vl); + _sum = __riscv_vfmacc_vf_f32m1(_sum, *kptr, _val, vl); m += packn; kptr += packn; @@ -227,7 +232,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse32_v_f32m1(outptr, _sum, vl); + __riscv_vse32_v_f32m1(outptr, _sum, vl); outptr += packn; } } @@ -235,7 +240,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt if (elempack == 1 && num_output_elempack == packn) { - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); float* outptr = top_blob.row(j); @@ -244,18 +249,18 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt const float* kptr = weight_data_tm.row(p); const float* m = bottom_blob.row(j); - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias_term) { - _sum = vle32_v_f32m1((const float*)bias_data + p * packn, vl); + _sum = __riscv_vle32_v_f32m1((const float*)bias_data + p * packn, vl); } int n = num_input; while (n > 0) { - vfloat32m1_t _w = vle32_v_f32m1(kptr, vl); - _sum = vfmacc_vf_f32m1(_sum, *m, _w, vl); + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl); + _sum = __riscv_vfmacc_vf_f32m1(_sum, *m, _w, vl); m += 1; kptr += packn; @@ -264,14 +269,14 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse32_v_f32m1(outptr, _sum, vl); + __riscv_vse32_v_f32m1(outptr, _sum, vl); outptr += packn; } } if (elempack == packn && num_output_elempack == 1) { - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); float* outptr = top_blob.row(j); @@ -280,18 +285,18 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt const float* kptr = (const float*)weight_data_tm + num_input * p; const float* m = bottom_blob.row(j); - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias_term) { - _sum = vfmv_v_f_f32m1(bias_data[p], vl); + _sum = __riscv_vfmv_v_f_f32m1(bias_data[p], vl); } int n = num_input; while (n > 0) { - vfloat32m1_t _val = vle32_v_f32m1(m, vl); - _sum = vfmacc_vf_f32m1(_sum, *kptr, _val, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(m, vl); + _sum = __riscv_vfmacc_vf_f32m1(_sum, *kptr, _val, vl); m += packn; kptr += 1; @@ -300,7 +305,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse32_v_f32m1(outptr, _sum, vl); + __riscv_vse32_v_f32m1(outptr, _sum, vl); outptr += packn; } } @@ -370,12 +375,12 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output / out_elempack; p++) { - const size_t vl = vsetvl_e32m1(packn); - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + const size_t vl = __riscv_vsetvl_e32m1(packn); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias_term) { - _sum = vle32_v_f32m1((const float*)bias_data + p * packn, vl); + _sum = __riscv_vle32_v_f32m1((const float*)bias_data + p * packn, vl); } const float* kptr = 
weight_data_tm.row(p); @@ -385,8 +390,8 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt int n = num_input; while (n > 0) { - vfloat32m1_t _w = vle32_v_f32m1(kptr, vl); - _sum = vfmacc_vf_f32m1(_sum, *sptr, _w, vl); + vfloat32m1_t _w = __riscv_vle32_v_f32m1(kptr, vl); + _sum = __riscv_vfmacc_vf_f32m1(_sum, *sptr, _w, vl); sptr += 1; kptr += packn; @@ -396,7 +401,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt _sum = activation_ps(_sum, activation_type, activation_params, vl); float* outptr = top_blob; - vse32_v_f32m1(outptr + p * packn, _sum, vl); + __riscv_vse32_v_f32m1(outptr + p * packn, _sum, vl); } } #endif // __riscv_vector @@ -412,12 +417,12 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt { int p = pp * packn; - const size_t vl = vsetvl_e32m1(packn); - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + const size_t vl = __riscv_vsetvl_e32m1(packn); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); if (bias_term) { - _sum = vle32_v_f32m1((const float*)bias_data + p, vl); + _sum = __riscv_vle32_v_f32m1((const float*)bias_data + p, vl); } const float* w = (const float*)weight_data_tm + num_input * p; @@ -427,9 +432,9 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt int n = num_input; while (n > 0) { - vfloat32m1_t _w = vlse32_v_f32m1(w, num_input * sizeof(float), vl); + vfloat32m1_t _w = __riscv_vlse32_v_f32m1(w, num_input * sizeof(float), vl); - _sum = vfmacc_vf_f32m1(_sum, *m, _w, vl); + _sum = __riscv_vfmacc_vf_f32m1(_sum, *m, _w, vl); m += 1; w += 1; @@ -438,7 +443,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt _sum = activation_ps(_sum, activation_type, activation_params, vl); - vse32_v_f32m1((float*)top_blob + p, _sum, vl); + __riscv_vse32_v_f32m1((float*)top_blob + p, _sum, vl); } #else // __riscv_vector int nn_num_output = num_output / 4; @@ -524,572 +529,4 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt return 0; } -#if __riscv_vector && __riscv_zfh -int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt) -{ - const int packn = csrr_vlenb() / 2; - - const int num_input = weight_data_size / num_output; - - int out_elempack = 1; - - if (opt.use_packing_layout) - { - out_elempack = num_output % packn == 0 ? 
packn : 1; - } - - // src = inch-outch - // dst = pb-inch-outch/pb - { - Mat weight_data_r2 = weight_data.reshape(num_input, num_output); - - weight_data_tm.create(num_input, num_output / out_elempack, (size_t)2u * out_elempack, out_elempack); - - for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) - { - __fp16* g0 = weight_data_tm.row<__fp16>(q / out_elempack); - - for (int p = 0; p < num_input; p++) - { - for (int j = 0; j < out_elempack; j++) - { - *g0++ = (__fp16)(weight_data_r2.row(q + j)[p]); - } - } - } - } - - ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); - - if (opt.lightmode) - weight_data.release(); - - return 0; -} - -int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - - const int num_input = weight_data_size / num_output; - - if (bottom_blob.dims == 2 && bottom_blob.w == num_input) - { - // gemm - int h = bottom_blob.h; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - int num_output_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int j = 0; j < h; j++) - { - if (elempack == packn && num_output_elempack == packn) - { - const size_t vl = vsetvl_e16m1(packn); - - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - for (int l = 0; l < packn; l++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l; - const __fp16* m = bottom_blob.row(j); - - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); - - if (bias_term) - { - _sum = vfmv_v_f_f32m2(bias_data[p * packn + l], vl); - } - - int n = num_input; - while (n > 0) - { - vfloat32m2_t _val = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(m, vl), vl); - - _sum = vfmacc_vf_f32m2(_sum, *kptr, _val, vl); - - m += packn; - kptr += packn; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - vse16_v_f16m1(outptr, vfncvt_f_f_w_f16m1(_sum, vl), vl); - outptr += packn; - } - } - } - - if (elempack == 1 && num_output_elempack == packn) - { - const size_t vl = vsetvl_e16m1(packn); - - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn; - const __fp16* m = bottom_blob.row(j); - - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); - - if (bias_term) - { - _sum = vle32_v_f32m2((const float*)bias_data + p * packn, vl); - } - - int n = num_input; - while (n > 0) - { - vfloat32m2_t _w = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(kptr, vl), vl); - - _sum = vfmacc_vf_f32m2(_sum, *m, _w, vl); - - m += 1; - kptr += packn; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - vse16_v_f16m1(outptr, vfncvt_f_f_w_f16m1(_sum, vl), vl); - outptr += packn; - } - } - - if (elempack == packn && num_output_elempack == 1) - { - const size_t vl = vsetvl_e16m1(packn); - - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output; p++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; - const __fp16* m = bottom_blob.row(j); - - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); - - if (bias_term) - { - _sum = vfmv_v_f_f32m2(bias_data[p], vl); - } - - int n = num_input; - while (n > 0) - { - vfloat32m2_t _val = 
vfwcvt_f_f_v_f32m2(vle16_v_f16m1(m, vl), vl); - - _sum = vfmacc_vf_f32m2(_sum, *kptr, _val, vl); - - m += packn; - kptr += 1; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - vse16_v_f16m1(outptr, vfncvt_f_f_w_f16m1(_sum, vl), vl); - outptr += packn; - } - } - - if (elempack == 1 && num_output_elempack == 1) - { - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output; p++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; - const __fp16* m = bottom_blob.row(j); - - float sum = 0.f; - - if (bias_term) - { - sum = bias_data[p]; - } - - for (int i = 0; i < num_input; i++) - { - sum += (float)m[i] * (float)kptr[i]; - } - - sum = activation_ss(sum, activation_type, activation_params); - - outptr[0] = (__fp16)sum; - outptr += 1; - } - } - } - - return 0; - } - - // flatten - Mat bottom_blob_flattened = bottom_blob; - if (bottom_blob.dims != 1) - { - Option opt_flatten = opt; - opt_flatten.blob_allocator = opt.workspace_allocator; - - flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); - } - - size_t elemsize = bottom_blob_flattened.elemsize; - int elempack = bottom_blob_flattened.elempack; - - int out_elempack = opt.use_packing_layout && num_output % packn == 0 ? packn : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; - - top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (out_elempack == packn) - { - // num_output - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output / out_elempack; p++) - { - const size_t vl = vsetvl_e16m1(packn); - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); - - if (bias_term) - { - _sum = vle32_v_f32m2((const float*)bias_data + p * packn, vl); - } - - const __fp16* kptr = weight_data_tm.row(p); - - const __fp16* sptr = bottom_blob_flattened; - - int n = num_input; - while (n > 0) - { - vfloat32m2_t _w = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(kptr, vl), vl); - - _sum = vfmacc_vf_f32m2(_sum, (float)(*sptr), _w, vl); - - sptr += 1; - kptr += packn; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - __fp16* outptr = (__fp16*)top_blob; - vse16_v_f16m1(outptr + p * packn, vfncvt_f_f_w_f16m1(_sum, vl), vl); - } - } - - if (out_elempack == 1) - { - // num_output - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output; p++) - { - float sum = 0.f; - - if (bias_term) - sum = bias_data[p]; - - const __fp16* kptr = weight_data_tm.row<__fp16>(p); - - const __fp16* sptr = bottom_blob_flattened; - - int i = 0; - for (; i < num_input; i++) - { - float v = (float)(*sptr); - float k = (float)(*kptr); - - sum += v * k; - - sptr++; - kptr++; - } - - sum = activation_ss(sum, activation_type, activation_params); - - __fp16* outptr = (__fp16*)top_blob; - outptr[p] = (__fp16)sum; - } - } - - return 0; -} - -int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - - const int num_input = weight_data_size / num_output; - - if (bottom_blob.dims == 2 && bottom_blob.w == num_input) - { - // gemm - int h = bottom_blob.h; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - int num_output_elempack = opt.use_packing_layout && num_output % packn == 0 ? 
packn : 1; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int j = 0; j < h; j++) - { - if (elempack == packn && num_output_elempack == packn) - { - const size_t vl = vsetvl_e16m1(packn); - - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - for (int l = 0; l < packn; l++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l; - const __fp16* m = bottom_blob.row(j); - - vfloat16m1_t _sum = vfmv_v_f_f16m1((__fp16)0.f, vl); - - if (bias_term) - { - _sum = vfmv_v_f_f16m1(((const __fp16*)bias_data_fp16)[p * packn + l], vl); - } - - int n = num_input; - while (n > 0) - { - vfloat16m1_t _val = vle16_v_f16m1(m, vl); - - _sum = vfmacc_vf_f16m1(_sum, *kptr, _val, vl); - - m += packn; - kptr += packn; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - vse16_v_f16m1(outptr, _sum, vl); - outptr += packn; - } - } - } - - if (elempack == 1 && num_output_elempack == packn) - { - const size_t vl = vsetvl_e16m1(packn); - - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn; - const __fp16* m = bottom_blob.row(j); - - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); - - if (bias_term) - { - _sum = vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl); - } - - int n = num_input; - while (n > 0) - { - vfloat16m1_t _w = vle16_v_f16m1(kptr, vl); - - _sum = vfmacc_vf_f16m1(_sum, *m, _w, vl); - - m += 1; - kptr += packn; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - vse16_v_f16m1(outptr, _sum, vl); - outptr += packn; - } - } - - if (elempack == packn && num_output_elempack == 1) - { - const size_t vl = vsetvl_e16m1(packn); - - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output; p++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; - const __fp16* m = bottom_blob.row(j); - - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); - - if (bias_term) - { - _sum = vfmv_v_f_f16m1(((const __fp16*)bias_data_fp16)[p], vl); - } - - int n = num_input; - while (n > 0) - { - vfloat16m1_t _val = vle16_v_f16m1(m, vl); - - _sum = vfmacc_vf_f16m1(_sum, *kptr, _val, vl); - - m += packn; - kptr += 1; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - vse16_v_f16m1(outptr, _sum, vl); - outptr += packn; - } - } - - if (elempack == 1 && num_output_elempack == 1) - { - __fp16* outptr = top_blob.row<__fp16>(j); - - for (int p = 0; p < num_output; p++) - { - const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; - const __fp16* m = bottom_blob.row(j); - - float sum = 0.f; - - if (bias_term) - { - sum = bias_data[p]; - } - - for (int i = 0; i < num_input; i++) - { - sum += (float)(m[i] * kptr[i]); - } - - sum = activation_ss(sum, activation_type, activation_params); - - outptr[0] = (__fp16)sum; - outptr += 1; - } - } - } - - return 0; - } - - // flatten - Mat bottom_blob_flattened = bottom_blob; - if (bottom_blob.dims != 1) - { - Option opt_flatten = opt; - opt_flatten.blob_allocator = opt.workspace_allocator; - - flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); - } - - size_t elemsize = bottom_blob_flattened.elemsize; - int elempack = bottom_blob_flattened.elempack; - - int out_elempack = opt.use_packing_layout && num_output % packn == 0 ? 
packn : 1; - size_t out_elemsize = elemsize / elempack * out_elempack; - - top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (out_elempack == packn) - { - // num_output - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output / out_elempack; p++) - { - const size_t vl = vsetvl_e16m1(packn); - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); - - if (bias_term) - { - _sum = vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl); - } - - const __fp16* kptr = weight_data_tm.row(p); - - const __fp16* sptr = bottom_blob_flattened; - - int n = num_input; - while (n > 0) - { - vfloat16m1_t _w = vle16_v_f16m1(kptr, vl); - - _sum = vfmacc_vf_f16m1(_sum, *sptr, _w, vl); - - sptr += 1; - kptr += packn; - n -= 1; - } - - _sum = activation_ps(_sum, activation_type, activation_params, vl); - - __fp16* outptr = (__fp16*)top_blob; - vse16_v_f16m1(outptr + p * packn, _sum, vl); - } - } - - if (out_elempack == 1) - { - // num_output - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output; p++) - { - float sum = 0.f; - - if (bias_term) - sum = bias_data[p]; - - const __fp16* kptr = weight_data_tm.row<__fp16>(p); - - const __fp16* sptr = bottom_blob_flattened; - - int i = 0; - for (; i < num_input; i++) - { - __fp16 v = *sptr; - __fp16 k = *kptr; - - sum += (float)(v * k); - - sptr++; - kptr++; - } - - sum = activation_ss(sum, activation_type, activation_params); - - __fp16* outptr = (__fp16*)top_blob; - outptr[p] = (__fp16)sum; - } - } - - return 0; -} -#endif // __riscv_vector && __riscv_zfh - } // namespace ncnn diff --git a/src/layer/riscv/innerproduct_riscv.h b/src/layer/riscv/innerproduct_riscv.h index d3056d5801d..17be502d0b1 100644 --- a/src/layer/riscv/innerproduct_riscv.h +++ b/src/layer/riscv/innerproduct_riscv.h @@ -30,7 +30,7 @@ class InnerProduct_riscv : public InnerProduct virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int create_pipeline_fp16s(const Option& opt); int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; diff --git a/src/layer/riscv/innerproduct_riscv_zfh.cpp b/src/layer/riscv/innerproduct_riscv_zfh.cpp new file mode 100644 index 00000000000..347cca0a640 --- /dev/null +++ b/src/layer/riscv/innerproduct_riscv_zfh.cpp @@ -0,0 +1,633 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "innerproduct_riscv.h" + +#if __riscv_vector +#include <riscv_vector.h> +#endif // __riscv_vector +#include "riscv_activation.h" +#include "riscv_usability.h" + +namespace ncnn { + +#if NCNN_ZFH +int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt) +{ +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh + + const int num_input = weight_data_size / num_output; + + int out_elempack = 1; + +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh + + // src = inch-outch + // dst = pb-inch-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / out_elempack, (size_t)2u * out_elempack, out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + __fp16* g0 = weight_data_tm.row<__fp16>(q / out_elempack); + + for (int p = 0; p < num_input; p++) + { + for (int j = 0; j < out_elempack; j++) + { + *g0++ = (__fp16)(weight_data_r2.row(q + j)[p]); + } + } + } + } + + ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); + + if (opt.lightmode) + weight_data.release(); + + return 0; +} + +int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh + + const int num_input = weight_data_size / num_output; + + if (bottom_blob.dims == 2 && bottom_blob.w == num_input) + { + // gemm + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + num_output_elempack = num_output % packn == 0 ? 
packn : 1; + } +#endif // __riscv_zvfh + + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < h; j++) + { +#if __riscv_zvfh + if (elempack == packn && num_output_elempack == packn) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + for (int l = 0; l < packn; l++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l; + const __fp16* m = bottom_blob.row(j); + + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vfmv_v_f_f32m2(bias_data[p * packn + l], vl); + } + + int n = num_input; + while (n > 0) + { + vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(m, vl), vl); + + _sum = __riscv_vfmacc_vf_f32m2(_sum, *kptr, _val, vl); + + m += packn; + kptr += packn; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); + outptr += packn; + } + } + } + + if (elempack == 1 && num_output_elempack == packn) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn; + const __fp16* m = bottom_blob.row(j); + + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vle32_v_f32m2((const float*)bias_data + p * packn, vl); + } + + int n = num_input; + while (n > 0) + { + vfloat32m2_t _w = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(kptr, vl), vl); + + _sum = __riscv_vfmacc_vf_f32m2(_sum, *m, _w, vl); + + m += 1; + kptr += packn; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); + outptr += packn; + } + } + + if (elempack == packn && num_output_elempack == 1) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output; p++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; + const __fp16* m = bottom_blob.row(j); + + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vfmv_v_f_f32m2(bias_data[p], vl); + } + + int n = num_input; + while (n > 0) + { + vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(m, vl), vl); + + _sum = __riscv_vfmacc_vf_f32m2(_sum, *kptr, _val, vl); + + m += packn; + kptr += 1; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); + outptr += packn; + } + } +#endif // __riscv_zvfh + + if (elempack == 1 && num_output_elempack == 1) + { + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output; p++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; + const __fp16* m = bottom_blob.row(j); + + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + for (int i = 0; i < num_input; i++) + { + sum += (float)m[i] * (float)kptr[i]; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[0] = (__fp16)sum; + outptr += 1; + } + } + } + + return 0; + } + + // flatten + Mat bottom_blob_flattened = bottom_blob; + if (bottom_blob.dims != 1) + { + Option opt_flatten = opt; + 
opt_flatten.blob_allocator = opt.workspace_allocator; + + flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + } + + size_t elemsize = bottom_blob_flattened.elemsize; + int elempack = bottom_blob_flattened.elempack; + + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __riscv_zvfh + if (out_elempack == packn) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vle32_v_f32m2((const float*)bias_data + p * packn, vl); + } + + const __fp16* kptr = weight_data_tm.row(p); + + const __fp16* sptr = bottom_blob_flattened; + + int n = num_input; + while (n > 0) + { + vfloat32m2_t _w = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(kptr, vl), vl); + + _sum = __riscv_vfmacc_vf_f32m2(_sum, (float)(*sptr), _w, vl); + + sptr += 1; + kptr += packn; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __fp16* outptr = (__fp16*)top_blob; + __riscv_vse16_v_f16m1(outptr + p * packn, __riscv_vfncvt_f_f_w_f16m1(_sum, vl), vl); + } + } +#endif // __riscv_zvfh + + if (out_elempack == 1) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output; p++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[p]; + + const __fp16* kptr = weight_data_tm.row<__fp16>(p); + + const __fp16* sptr = bottom_blob_flattened; + + int i = 0; + for (; i < num_input; i++) + { + float v = (float)(*sptr); + float k = (float)(*kptr); + + sum += v * k; + + sptr++; + kptr++; + } + + sum = activation_ss(sum, activation_type, activation_params); + + __fp16* outptr = (__fp16*)top_blob; + outptr[p] = (__fp16)sum; + } + } + + return 0; +} + +int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh + + const int num_input = weight_data_size / num_output; + + if (bottom_blob.dims == 2 && bottom_blob.w == num_input) + { + // gemm + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + num_output_elempack = num_output % packn == 0 ? 
packn : 1; + } +#endif // __riscv_zvfh + + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < h; j++) + { +#if __riscv_zvfh + if (elempack == packn && num_output_elempack == packn) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + for (int l = 0; l < packn; l++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn + l; + const __fp16* m = bottom_blob.row(j); + + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + + if (bias_term) + { + _sum = __riscv_vfmv_v_f_f16m1(((const __fp16*)bias_data_fp16)[p * packn + l], vl); + } + + int n = num_input; + while (n > 0) + { + vfloat16m1_t _val = __riscv_vle16_v_f16m1(m, vl); + + _sum = __riscv_vfmacc_vf_f16m1(_sum, *kptr, _val, vl); + + m += packn; + kptr += packn; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr, _sum, vl); + outptr += packn; + } + } + } + + if (elempack == 1 && num_output_elempack == packn) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * packn; + const __fp16* m = bottom_blob.row(j); + + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl); + } + + int n = num_input; + while (n > 0) + { + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); + + _sum = __riscv_vfmacc_vf_f16m1(_sum, *m, _w, vl); + + m += 1; + kptr += packn; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr, _sum, vl); + outptr += packn; + } + } + + if (elempack == packn && num_output_elempack == 1) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output; p++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; + const __fp16* m = bottom_blob.row(j); + + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vfmv_v_f_f16m1(((const __fp16*)bias_data_fp16)[p], vl); + } + + int n = num_input; + while (n > 0) + { + vfloat16m1_t _val = __riscv_vle16_v_f16m1(m, vl); + + _sum = __riscv_vfmacc_vf_f16m1(_sum, *kptr, _val, vl); + + m += packn; + kptr += 1; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __riscv_vse16_v_f16m1(outptr, _sum, vl); + outptr += packn; + } + } +#endif // __riscv_zvfh + + if (elempack == 1 && num_output_elempack == 1) + { + __fp16* outptr = top_blob.row<__fp16>(j); + + for (int p = 0; p < num_output; p++) + { + const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p; + const __fp16* m = bottom_blob.row(j); + + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + for (int i = 0; i < num_input; i++) + { + sum += (float)(m[i] * kptr[i]); + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[0] = (__fp16)sum; + outptr += 1; + } + } + } + + return 0; + } + + // flatten + Mat bottom_blob_flattened = bottom_blob; + if (bottom_blob.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + + flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + } + + size_t elemsize = 
bottom_blob_flattened.elemsize; + int elempack = bottom_blob_flattened.elempack; + + int out_elempack = 1; +#if __riscv_zvfh + if (opt.use_packing_layout) + { + out_elempack = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_zvfh + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __riscv_zvfh + if (out_elempack == packn) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + + if (bias_term) + { + _sum = __riscv_vle16_v_f16m1((const __fp16*)bias_data_fp16 + p * packn, vl); + } + + const __fp16* kptr = weight_data_tm.row(p); + + const __fp16* sptr = bottom_blob_flattened; + + int n = num_input; + while (n > 0) + { + vfloat16m1_t _w = __riscv_vle16_v_f16m1(kptr, vl); + + _sum = __riscv_vfmacc_vf_f16m1(_sum, *sptr, _w, vl); + + sptr += 1; + kptr += packn; + n -= 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params, vl); + + __fp16* outptr = (__fp16*)top_blob; + __riscv_vse16_v_f16m1(outptr + p * packn, _sum, vl); + } + } +#endif // __riscv_zvfh + + if (out_elempack == 1) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output; p++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[p]; + + const __fp16* kptr = weight_data_tm.row<__fp16>(p); + + const __fp16* sptr = bottom_blob_flattened; + + int i = 0; + for (; i < num_input; i++) + { + __fp16 v = *sptr; + __fp16 k = *kptr; + + sum += (float)(v * k); + + sptr++; + kptr++; + } + + sum = activation_ss(sum, activation_type, activation_params); + + __fp16* outptr = (__fp16*)top_blob; + outptr[p] = (__fp16)sum; + } + } + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/instancenorm_riscv.cpp b/src/layer/riscv/instancenorm_riscv.cpp index 20cf5d94c7d..fccc61543b6 100644 --- a/src/layer/riscv/instancenorm_riscv.cpp +++ b/src/layer/riscv/instancenorm_riscv.cpp @@ -20,22 +20,28 @@ #include "riscv_usability.h" +#include "cpu.h" + namespace ncnn { InstanceNorm_riscv::InstanceNorm_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif } int InstanceNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -// x = (x - mean) / (sqrt(var + eps)) * gamma + beta -#if __riscv_vector +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); + if (opt.use_fp16_storage && elembits == 16) { if (opt.use_fp16_arithmetic) @@ -43,21 +49,19 @@ int InstanceNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) else return forward_inplace_fp16s(bottom_top_blob, opt); } +#endif + + // x = (x - mean) / (sqrt(var + eps)) * gamma + beta + int elempack = bottom_top_blob.elempack; -#endif // __riscv_vector int w = bottom_top_blob.w; int h = bottom_top_blob.h; int c = bottom_top_blob.c; int size = w * h; int dims = bottom_top_blob.dims; -#if __riscv_vector if (elempack == 1) -#endif // __riscv_vector { -#if __riscv_vector - size = elempack * size; -#endif #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < c; q++) { @@ -66,23 +70,23 
@@ int InstanceNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) // mean and var float sum = 0.f; float sqsum = 0.f; -#if __riscv_vector - vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); - vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); +#if __riscv_vector && !defined(C906) + vfloat32m1_t _sum = __riscv_vfmv_s_f_f32m1(0.f, __riscv_vsetvlmax_e32m1()); + vfloat32m1_t _sqsum = __riscv_vfmv_s_f_f32m1(0.f, __riscv_vsetvlmax_e32m1()); { int n = size; float* ptr_sum = ptr; while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl); - _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); - // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr_sum, vl); + _sum = __riscv_vfredusum_vs_f32m8_f32m1(_p, _sum, vl); + // _sqsum = __riscv_vfredosum_vs_f32m8_f32m1(__riscv_vfmul_vv_f32m8(_p, _p, vl), _sqsum, vl); ptr_sum += vl; n -= vl; } } - sum = vfmv_f_s_f32m1_f32(_sum); + sum = __riscv_vfmv_f_s_f32m1_f32(_sum); #else for (int i = 0; i < size; i++) { @@ -91,21 +95,21 @@ int InstanceNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) } #endif // __riscv_vector float mean = sum / size; -#if __riscv_vecotr +#if __riscv_vector && !defined(C906) { int n = size; float* ptr_sqsum = ptr; while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl); - _p = vfsub_vf_f32m8(_p, mean, vl); - _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr_sqsum, vl); + _p = __riscv_vfsub_vf_f32m8(_p, mean, vl); + _sqsum = __riscv_vfredosum_vs_f32m8_f32m1(__riscv_vfmul_vv_f32m8(_p, _p, vl), _sqsum, vl); n -= vl; ptr_sqsum += vl; } } - sqsum = vfmv_f_s_f32m1_f32(_sqsum); + sqsum = __riscv_vfmv_f_s_f32m1_f32(_sqsum); #else float tmp = 0.f; for (int i = 0; i < size; i++) @@ -139,11 +143,11 @@ int InstanceNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) float* ptr_store = ptr; while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); - _p = vfmul_vf_f32m8(_p, a, vl); - _p = vfadd_vf_f32m8(_p, b, vl); - vse32_v_f32m8(ptr_store, _p, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr_store, vl); + _p = __riscv_vfmul_vf_f32m8(_p, a, vl); + _p = __riscv_vfadd_vf_f32m8(_p, b, vl); + __riscv_vse32_v_f32m8(ptr_store, _p, vl); n -= vl; ptr_store += vl; } @@ -162,28 +166,28 @@ int InstanceNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int packn = csrr_vlenb() / 4; if (elempack == packn) { - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < c; q++) { float* ptr = bottom_top_blob.channel(q); - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); - vfloat32m1_t _sqsum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sqsum = __riscv_vfmv_v_f_f32m1(0.f, vl); for (int i = 0; i < size; i++) { - vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl); - _sum = vfadd_vv_f32m1(_p, _sum, vl); - // _sqsum = vfmadd_vv_f32m1(_p,_p,_sqsum,vl); + vfloat32m1_t _p = __riscv_vle32_v_f32m1(ptr + vl * i, vl); + _sum = __riscv_vfadd_vv_f32m1(_p, _sum, vl); + // 
_sqsum = __riscv_vfmadd_vv_f32m1(_p,_p,_sqsum,vl); } - vfloat32m1_t _mean = vfdiv_vf_f32m1(_sum, size, vl); + vfloat32m1_t _mean = __riscv_vfdiv_vf_f32m1(_sum, size, vl); for (int i = 0; i < size; i++) { - vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl); - _p = vfsub_vv_f32m1(_p, _mean, vl); - _sqsum = vfmadd_vv_f32m1(_p, _p, _sqsum, vl); + vfloat32m1_t _p = __riscv_vle32_v_f32m1(ptr + vl * i, vl); + _p = __riscv_vfsub_vv_f32m1(_p, _mean, vl); + _sqsum = __riscv_vfmadd_vv_f32m1(_p, _p, _sqsum, vl); } - vfloat32m1_t _var = vfdiv_vf_f32m1(_sqsum, size, vl); + vfloat32m1_t _var = __riscv_vfdiv_vf_f32m1(_sqsum, size, vl); // the var maybe minus due to accuracy //float var = sqsum / size - mean * mean; @@ -191,22 +195,22 @@ int InstanceNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) vfloat32m1_t _b; if (affine) { - vfloat32m1_t _gamma = vle32_v_f32m1((const float*)gamma_data + q * vl, vl); - vfloat32m1_t _beta = vle32_v_f32m1((const float*)beta_data + q * vl, vl); - _a = vfdiv_vv_f32m1(_gamma, vfsqrt_v_f32m1(vfadd_vf_f32m1(_var, eps, vl), vl), vl); - _b = vfnmsub_vv_f32m1(_a, _mean, _beta, vl); + vfloat32m1_t _gamma = __riscv_vle32_v_f32m1((const float*)gamma_data + q * vl, vl); + vfloat32m1_t _beta = __riscv_vle32_v_f32m1((const float*)beta_data + q * vl, vl); + _a = __riscv_vfdiv_vv_f32m1(_gamma, __riscv_vfsqrt_v_f32m1(__riscv_vfadd_vf_f32m1(_var, eps, vl), vl), vl); + _b = __riscv_vfnmsub_vv_f32m1(_a, _mean, _beta, vl); } else { - _a = vfrdiv_vf_f32m1(vfsqrt_v_f32m1(vfadd_vf_f32m1(_var, eps, vl), vl), 1.f, vl); - _b = vfmul_vv_f32m1(_a, _mean, vl); - _b = vfsgnjn_vv_f32m1(_b, _b, vl); + _a = __riscv_vfrdiv_vf_f32m1(__riscv_vfsqrt_v_f32m1(__riscv_vfadd_vf_f32m1(_var, eps, vl), vl), 1.f, vl); + _b = __riscv_vfmul_vv_f32m1(_a, _mean, vl); + _b = __riscv_vfsgnjn_vv_f32m1(_b, _b, vl); } for (int i = 0; i < size; i++) { - vfloat32m1_t _p = vle32_v_f32m1(ptr + i * vl, vl); - _p = vfmadd_vv_f32m1(_p, _a, _b, vl); - vse32_v_f32m1(ptr + i * vl, _p, vl); + vfloat32m1_t _p = __riscv_vle32_v_f32m1(ptr + i * vl, vl); + _p = __riscv_vfmadd_vv_f32m1(_p, _a, _b, vl); + __riscv_vse32_v_f32m1(ptr + i * vl, _p, vl); } } return 0; @@ -215,298 +219,4 @@ int InstanceNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) return 0; } -#if __riscv_vector && __riscv_zfh -int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const -{ - // x = (x - mean) / (sqrt(var + eps)) * gamma + beta - - int elempack = bottom_top_blob.elempack; - - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int c = bottom_top_blob.c; - int size = w * h; - - int dims = bottom_top_blob.dims; - if (elempack == 1) - { - size = elempack * size; - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < c; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - - // mean and var - float sum = 0.f; - float sqsum = 0.f; - vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); - vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); - { - int n = size; - __fp16* ptr_sum = ptr; - while (n > 0) - { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sum, vl), vl); - _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); - // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); - ptr_sum += vl; - n -= vl; - } - } - sum = vfmv_f_s_f32m1_f32(_sum); - float mean = sum / size; - { - int n = size; - __fp16* ptr_sqsum = ptr; 
- while (n > 0) - { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sqsum, vl), vl); - _p = vfsub_vf_f32m8(_p, mean, vl); - _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); - n -= vl; - ptr_sqsum += vl; - } - } - sqsum = vfmv_f_s_f32m1_f32(_sqsum); - float var = sqsum / size; - // the var maybe minus due to accuracy - //float var = sqsum / size - mean * mean; - - float a; - float b; - if (affine) - { - float gamma = gamma_data[q]; - float beta = beta_data[q]; - - a = gamma / (sqrtf(var + eps)); - b = -mean * a + beta; - } - else - { - a = 1.f / (sqrtf(var + eps)); - b = -mean * a; - } - { - int n = size; - __fp16* ptr_store = ptr; - while (n > 0) - { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_store, vl), vl); - _p = vfmul_vf_f32m8(_p, a, vl); - _p = vfadd_vf_f32m8(_p, b, vl); - vse16_v_f16m4(ptr_store, vfncvt_f_f_w_f16m4(_p, vl), vl); - n -= vl; - ptr_store += vl; - } - } - } - return 0; - } - - const int packn = csrr_vlenb() / 2; - if (elempack == packn) - { - const size_t vl = vsetvl_e16m1(packn); - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < c; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); - vfloat32m2_t _sqsum = vfmv_v_f_f32m2(0.f, vl); - - for (int i = 0; i < size; i++) - { - vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + vl * i, vl), vl); - _sum = vfadd_vv_f32m2(_p, _sum, vl); - // _sqsum = vfmadd_vv_f32m2(_p,_p,_sqsum,vl); - } - vfloat32m2_t _mean = vfdiv_vf_f32m2(_sum, size, vl); - for (int i = 0; i < size; i++) - { - vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + vl * i, vl), vl); - _p = vfsub_vv_f32m2(_p, _mean, vl); - _sqsum = vfmadd_vv_f32m2(_p, _p, _sqsum, vl); - } - vfloat32m2_t _var = vfdiv_vf_f32m2(_sqsum, size, vl); - // the var maybe minus due to accuracy - //float var = sqsum / size - mean * mean; - - vfloat32m2_t _a; - vfloat32m2_t _b; - if (affine) - { - vfloat32m2_t _gamma = vle32_v_f32m2((const float*)gamma_data + q * vl, vl); - vfloat32m2_t _beta = vle32_v_f32m2((const float*)beta_data + q * vl, vl); - _a = vfdiv_vv_f32m2(_gamma, vfsqrt_v_f32m2(vfadd_vf_f32m2(_var, eps, vl), vl), vl); - _b = vfnmsub_vv_f32m2(_a, _mean, _beta, vl); - } - else - { - _a = vfrdiv_vf_f32m2(vfsqrt_v_f32m2(vfadd_vf_f32m2(_var, eps, vl), vl), 1.f, vl); - _b = vfmul_vv_f32m2(_a, _mean, vl); - _b = vfsgnjn_vv_f32m2(_b, _b, vl); - } - for (int i = 0; i < size; i++) - { - vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + i * vl, vl), vl); - _p = vfmadd_vv_f32m2(_p, _a, _b, vl); - vse16_v_f16m1(ptr + i * vl, vfncvt_f_f_w_f16m1(_p, vl), vl); - } - } - return 0; - } - return 0; -} - -int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const -{ - // x = (x - mean) / (sqrt(var + eps)) * gamma + beta - int elempack = bottom_top_blob.elempack; - - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int c = bottom_top_blob.c; - int size = w * h; - - int dims = bottom_top_blob.dims; - if (elempack == 1) - { - size = elempack * size; - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < c; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - - // mean and var - __fp16 sum = 0.f; - __fp16 sqsum = 0.f; - vfloat16m1_t _sum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1()); - vfloat16m1_t _sqsum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1()); - { - int n = size; - __fp16* 
ptr_sum = ptr; - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - vfloat16m8_t _p = vle16_v_f16m8(ptr_sum, vl); - _sum = vfredusum_vs_f16m8_f16m1(_sum, _p, /* scalar */ _sum, vl); - // _sqsum = vfredosum_vs_f16m8_f16m1(_sqsum, vfmul_vv_f16m8(_p, _p, vl), /* scalar */ _sqsum, vl); - ptr_sum += vl; - n -= vl; - } - } - sum = vfmv_f_s_f16m1_f16(_sum); - __fp16 mean = sum / size; - { - int n = size; - __fp16* ptr_sqsum = ptr; - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - vfloat16m8_t _p = vle16_v_f16m8(ptr_sqsum, vl); - _p = vfsub_vf_f16m8(_p, mean, vl); - _sqsum = vfredosum_vs_f16m8_f16m1(_sqsum, vfmul_vv_f16m8(_p, _p, vl), /* scalar */ _sqsum, vl); - n -= vl; - ptr_sqsum += vl; - } - } - sqsum = vfmv_f_s_f16m1_f16(_sqsum); - __fp16 var = sqsum / size; - // the var maybe minus due to accuracy - //float var = sqsum / size - mean * mean; - - __fp16 a; - __fp16 b; - if (affine) - { - float gamma = gamma_data[q]; - float beta = beta_data[q]; - - a = static_cast<__fp16>(gamma / (sqrt(var + eps))); - b = static_cast<__fp16>(-mean * a + beta); - } - else - { - a = static_cast<__fp16>(1.f / (sqrt(var + eps))); - b = static_cast<__fp16>(-mean * a); - } - { - int n = size; - __fp16* ptr_store = ptr; - while (n > 0) - { - size_t vl = vsetvl_e32m8(n); - vfloat16m8_t _p = vle16_v_f16m8(ptr_store, vl); - _p = vfmul_vf_f16m8(_p, a, vl); - _p = vfadd_vf_f16m8(_p, b, vl); - vse16_v_f16m8(ptr_store, _p, vl); - n -= vl; - ptr_store += vl; - } - } - } - return 0; - } - - const int packn = csrr_vlenb() / 2; - if (elempack == packn) - { - const size_t vl = vsetvl_e16m1(packn); - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < c; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); - vfloat16m1_t _sqsum = vfmv_v_f_f16m1(0.f, vl); - - for (int i = 0; i < size; i++) - { - vfloat16m1_t _p = vle16_v_f16m1(ptr + vl * i, vl); - _sum = vfadd_vv_f16m1(_p, _sum, vl); - // _sqsum = vfmadd_vv_f16m1(_p,_p,_sqsum,vl); - } - vfloat16m1_t _mean = vfdiv_vf_f16m1(_sum, size, vl); - for (int i = 0; i < size; i++) - { - vfloat16m1_t _p = vle16_v_f16m1(ptr + vl * i, vl); - _p = vfsub_vv_f16m1(_p, _mean, vl); - _sqsum = vfmadd_vv_f16m1(_p, _p, _sqsum, vl); - } - vfloat16m1_t _var = vfdiv_vf_f16m1(_sqsum, size, vl); - // the var maybe minus due to accuracy - //float var = sqsum / size - mean * mean; - - vfloat16m1_t _a; - vfloat16m1_t _b; - if (affine) - { - vfloat16m1_t _gamma = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)gamma_data + q * vl, vl), vl); - vfloat16m1_t _beta = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)beta_data + q * vl, vl), vl); - _a = vfdiv_vv_f16m1(_gamma, vfsqrt_v_f16m1(vfadd_vf_f16m1(_var, eps, vl), vl), vl); - _b = vfnmsub_vv_f16m1(_a, _mean, _beta, vl); - } - else - { - _a = vfrdiv_vf_f16m1(vfsqrt_v_f16m1(vfadd_vf_f16m1(_var, eps, vl), vl), 1.f, vl); - _b = vfmul_vv_f16m1(_a, _mean, vl); - _b = vfsgnjn_vv_f16m1(_b, _b, vl); - } - for (int i = 0; i < size; i++) - { - vfloat16m1_t _p = vle16_v_f16m1(ptr + i * vl, vl); - _p = vfmadd_vv_f16m1(_p, _a, _b, vl); - vse16_v_f16m1(ptr + i * vl, _p, vl); - } - } - return 0; - } - return 0; -} - -#endif // __riscv_vector && __riscv_zfh - -} // namespace ncnn \ No newline at end of file +} // namespace ncnn diff --git a/src/layer/riscv/instancenorm_riscv.h b/src/layer/riscv/instancenorm_riscv.h index b0d2e9004ac..006b8bfc1ad 100644 --- a/src/layer/riscv/instancenorm_riscv.h +++ b/src/layer/riscv/instancenorm_riscv.h @@ -26,11 +26,11 @@ class InstanceNorm_riscv : public InstanceNorm 
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif }; } // namespace ncnn -#endif // LAYER_INSTANCENORM_RISCV_H \ No newline at end of file +#endif // LAYER_INSTANCENORM_RISCV_H diff --git a/src/layer/riscv/instancenorm_riscv_zfh.cpp b/src/layer/riscv/instancenorm_riscv_zfh.cpp new file mode 100644 index 00000000000..d3ad2cf15d5 --- /dev/null +++ b/src/layer/riscv/instancenorm_riscv_zfh.cpp @@ -0,0 +1,368 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "instancenorm_riscv.h" + +#if __riscv_vector +#include <riscv_vector.h> +#endif // __riscv_vector + +#include "riscv_usability.h" + +namespace ncnn { + +#if NCNN_ZFH +int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + // x = (x - mean) / (sqrt(var + eps)) * gamma + beta + + int elempack = bottom_top_blob.elempack; + + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int c = bottom_top_blob.c; + int size = w * h; + + int dims = bottom_top_blob.dims; + if (elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + + // mean and var + float sum = 0.f; + float sqsum = 0.f; +#if __riscv_zvfh && !defined(C906) + vfloat32m1_t _sum = __riscv_vfmv_s_f_f32m1(0.f, __riscv_vsetvlmax_e32m1()); + vfloat32m1_t _sqsum = __riscv_vfmv_s_f_f32m1(0.f, __riscv_vsetvlmax_e32m1()); + { + int n = size; + __fp16* ptr_sum = ptr; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr_sum, vl), vl); + _sum = __riscv_vfredusum_vs_f32m8_f32m1(_p, /* scalar */ _sum, vl); + // _sqsum = __riscv_vfredosum_vs_f32m8_f32m1(__riscv_vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + ptr_sum += vl; + n -= vl; + } + } + sum = __riscv_vfmv_f_s_f32m1_f32(_sum); +#else + for (int i = 0; i < size; i++) + { + sum += ptr[i]; + //sqsum += ptr[i] * ptr[i]; + } +#endif // __riscv_zvfh + float mean = sum / size; +#if __riscv_zvfh && !defined(C906) + { + int n = size; + __fp16* ptr_sqsum = ptr; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr_sqsum, vl), vl); + _p = __riscv_vfsub_vf_f32m8(_p, mean, vl); + _sqsum = __riscv_vfredosum_vs_f32m8_f32m1(__riscv_vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + n -= vl; + ptr_sqsum += vl; + } + } + sqsum = __riscv_vfmv_f_s_f32m1_f32(_sqsum); +#else + float tmp = 0.f; + for (int i = 0; i < size; i++) + { + tmp = ptr[i] - mean; + sqsum += tmp * tmp; + } +#endif // __riscv_zvfh + float var = sqsum / size; + // the var maybe minus due to 
accuracy + //float var = sqsum / size - mean * mean; + + float a; + float b; + if (affine) + { + float gamma = gamma_data[q]; + float beta = beta_data[q]; + + a = gamma / (sqrtf(var + eps)); + b = -mean * a + beta; + } + else + { + a = 1.f / (sqrtf(var + eps)); + b = -mean * a; + } +#if __riscv_zvfh + { + int n = size; + __fp16* ptr_store = ptr; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr_store, vl), vl); + _p = __riscv_vfmul_vf_f32m8(_p, a, vl); + _p = __riscv_vfadd_vf_f32m8(_p, b, vl); + __riscv_vse16_v_f16m4(ptr_store, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); + n -= vl; + ptr_store += vl; + } + } +#else + for (int i = 0; i < size; i++) + { + ptr[i] = ptr[i] * a + b; + } +#endif // __riscv_zvfh + } + return 0; + } + +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; + if (elempack == packn) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); + vfloat32m2_t _sqsum = __riscv_vfmv_v_f_f32m2(0.f, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m2_t _p = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(ptr + vl * i, vl), vl); + _sum = __riscv_vfadd_vv_f32m2(_p, _sum, vl); + // _sqsum = __riscv_vfmadd_vv_f32m2(_p,_p,_sqsum,vl); + } + vfloat32m2_t _mean = __riscv_vfdiv_vf_f32m2(_sum, size, vl); + for (int i = 0; i < size; i++) + { + vfloat32m2_t _p = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(ptr + vl * i, vl), vl); + _p = __riscv_vfsub_vv_f32m2(_p, _mean, vl); + _sqsum = __riscv_vfmadd_vv_f32m2(_p, _p, _sqsum, vl); + } + vfloat32m2_t _var = __riscv_vfdiv_vf_f32m2(_sqsum, size, vl); + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; + + vfloat32m2_t _a; + vfloat32m2_t _b; + if (affine) + { + vfloat32m2_t _gamma = __riscv_vle32_v_f32m2((const float*)gamma_data + q * vl, vl); + vfloat32m2_t _beta = __riscv_vle32_v_f32m2((const float*)beta_data + q * vl, vl); + _a = __riscv_vfdiv_vv_f32m2(_gamma, __riscv_vfsqrt_v_f32m2(__riscv_vfadd_vf_f32m2(_var, eps, vl), vl), vl); + _b = __riscv_vfnmsub_vv_f32m2(_a, _mean, _beta, vl); + } + else + { + _a = __riscv_vfrdiv_vf_f32m2(__riscv_vfsqrt_v_f32m2(__riscv_vfadd_vf_f32m2(_var, eps, vl), vl), 1.f, vl); + _b = __riscv_vfmul_vv_f32m2(_a, _mean, vl); + _b = __riscv_vfsgnjn_vv_f32m2(_b, _b, vl); + } + for (int i = 0; i < size; i++) + { + vfloat32m2_t _p = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(ptr + i * vl, vl), vl); + _p = __riscv_vfmadd_vv_f32m2(_p, _a, _b, vl); + __riscv_vse16_v_f16m1(ptr + i * vl, __riscv_vfncvt_f_f_w_f16m1(_p, vl), vl); + } + } + return 0; + } +#endif // __riscv_zvfh + return 0; +} + +int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const +{ + // x = (x - mean) / (sqrt(var + eps)) * gamma + beta + int elempack = bottom_top_blob.elempack; + + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int c = bottom_top_blob.c; + int size = w * h; + + int dims = bottom_top_blob.dims; + if (elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + + // mean and var + __fp16 sum = 0.f; + __fp16 sqsum = 0.f; +#if __riscv_zvfh && !defined(C906) + vfloat16m1_t _sum = __riscv_vfmv_s_f_f16m1(0.f, __riscv_vsetvlmax_e32m1()); + vfloat16m1_t _sqsum = __riscv_vfmv_s_f_f16m1(0.f, 
__riscv_vsetvlmax_e32m1()); + { + int n = size; + __fp16* ptr_sum = ptr; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr_sum, vl); + _sum = __riscv_vfredusum_vs_f16m8_f16m1(_p, /* scalar */ _sum, vl); + // _sqsum = __riscv_vfredosum_vs_f16m8_f16m1(__riscv_vfmul_vv_f16m8(_p, _p, vl), /* scalar */ _sqsum, vl); + ptr_sum += vl; + n -= vl; + } + } + sum = __riscv_vfmv_f_s_f16m1_f16(_sum); +#else + for (int i = 0; i < size; i++) + { + sum += ptr[i]; + //sqsum += ptr[i] * ptr[i]; + } +#endif // __riscv_zvfh + __fp16 mean = sum / size; +#if __riscv_zvfh && !defined(C906) + { + int n = size; + __fp16* ptr_sqsum = ptr; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr_sqsum, vl); + _p = __riscv_vfsub_vf_f16m8(_p, mean, vl); + _sqsum = __riscv_vfredosum_vs_f16m8_f16m1(__riscv_vfmul_vv_f16m8(_p, _p, vl), /* scalar */ _sqsum, vl); + n -= vl; + ptr_sqsum += vl; + } + } + sqsum = __riscv_vfmv_f_s_f16m1_f16(_sqsum); +#else + float tmp = 0.f; + for (int i = 0; i < size; i++) + { + tmp = ptr[i] - mean; + sqsum += tmp * tmp; + } +#endif // __riscv_zvfh + __fp16 var = sqsum / size; + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; + + __fp16 a; + __fp16 b; + if (affine) + { + float gamma = gamma_data[q]; + float beta = beta_data[q]; + + a = static_cast<__fp16>(gamma / (sqrt(var + eps))); + b = static_cast<__fp16>(-mean * a + beta); + } + else + { + a = static_cast<__fp16>(1.f / (sqrt(var + eps))); + b = static_cast<__fp16>(-mean * a); + } +#if __riscv_zvfh + { + int n = size; + __fp16* ptr_store = ptr; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr_store, vl); + _p = __riscv_vfmul_vf_f16m8(_p, a, vl); + _p = __riscv_vfadd_vf_f16m8(_p, b, vl); + __riscv_vse16_v_f16m8(ptr_store, _p, vl); + n -= vl; + ptr_store += vl; + } + } +#else + for (int i = 0; i < size; i++) + { + ptr[i] = ptr[i] * a + b; + } +#endif // __riscv_zvfh + } + return 0; + } + +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; + if (elempack == packn) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sqsum = __riscv_vfmv_v_f_f16m1(0.f, vl); + + for (int i = 0; i < size; i++) + { + vfloat16m1_t _p = __riscv_vle16_v_f16m1(ptr + vl * i, vl); + _sum = __riscv_vfadd_vv_f16m1(_p, _sum, vl); + // _sqsum = __riscv_vfmadd_vv_f16m1(_p,_p,_sqsum,vl); + } + vfloat16m1_t _mean = __riscv_vfdiv_vf_f16m1(_sum, size, vl); + for (int i = 0; i < size; i++) + { + vfloat16m1_t _p = __riscv_vle16_v_f16m1(ptr + vl * i, vl); + _p = __riscv_vfsub_vv_f16m1(_p, _mean, vl); + _sqsum = __riscv_vfmadd_vv_f16m1(_p, _p, _sqsum, vl); + } + vfloat16m1_t _var = __riscv_vfdiv_vf_f16m1(_sqsum, size, vl); + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; + + vfloat16m1_t _a; + vfloat16m1_t _b; + if (affine) + { + vfloat16m1_t _gamma = __riscv_vfncvt_f_f_w_f16m1(__riscv_vle32_v_f32m2((const float*)gamma_data + q * vl, vl), vl); + vfloat16m1_t _beta = __riscv_vfncvt_f_f_w_f16m1(__riscv_vle32_v_f32m2((const float*)beta_data + q * vl, vl), vl); + _a = __riscv_vfdiv_vv_f16m1(_gamma, __riscv_vfsqrt_v_f16m1(__riscv_vfadd_vf_f16m1(_var, eps, vl), vl), vl); + _b = __riscv_vfnmsub_vv_f16m1(_a, _mean, _beta, vl); + } + else + { + _a = 
__riscv_vfrdiv_vf_f16m1(__riscv_vfsqrt_v_f16m1(__riscv_vfadd_vf_f16m1(_var, eps, vl), vl), 1.f, vl); + _b = __riscv_vfmul_vv_f16m1(_a, _mean, vl); + _b = __riscv_vfsgnjn_vv_f16m1(_b, _b, vl); + } + for (int i = 0; i < size; i++) + { + vfloat16m1_t _p = __riscv_vle16_v_f16m1(ptr + i * vl, vl); + _p = __riscv_vfmadd_vv_f16m1(_p, _a, _b, vl); + __riscv_vse16_v_f16m1(ptr + i * vl, _p, vl); + } + } + return 0; + } +#endif // __riscv_zvfh + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/interp_bicubic_packn.h b/src/layer/riscv/interp_bicubic_packn.h index b19af95f0f8..c53eb74ab85 100644 --- a/src/layer/riscv/interp_bicubic_packn.h +++ b/src/layer/riscv/interp_bicubic_packn.h @@ -15,7 +15,7 @@ static void resize_bicubic_image_packn(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int w = dst.w; int h = dst.h; @@ -57,13 +57,13 @@ static void resize_bicubic_image_packn(const Mat& src, Mat& dst, float* alpha, i int sx = xofs[dx] * packn; const float* S3p = S3 + sx; - vfloat32m1_t _S30 = vle32_v_f32m1(S3p - packn, vl); - vfloat32m1_t _S31 = vle32_v_f32m1(S3p, vl); - vfloat32m1_t _S32 = vle32_v_f32m1(S3p + packn, vl); - vfloat32m1_t _S33 = vle32_v_f32m1(S3p + packn * 2, vl); - vfloat32m1_t _rows3 = vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmul_vf_f32m1(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + vfloat32m1_t _S30 = __riscv_vle32_v_f32m1(S3p - packn, vl); + vfloat32m1_t _S31 = __riscv_vle32_v_f32m1(S3p, vl); + vfloat32m1_t _S32 = __riscv_vle32_v_f32m1(S3p + packn, vl); + vfloat32m1_t _S33 = __riscv_vle32_v_f32m1(S3p + packn * 2, vl); + vfloat32m1_t _rows3 = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); - vse32_v_f32m1(rows3p + dx * packn, _rows3, vl); + __riscv_vse32_v_f32m1(rows3p + dx * packn, _rows3, vl); alphap += 4; } @@ -89,19 +89,19 @@ static void resize_bicubic_image_packn(const Mat& src, Mat& dst, float* alpha, i const float* S2p = S2 + sx; const float* S3p = S3 + sx; - vfloat32m1_t _S20 = vle32_v_f32m1(S2p - packn, vl); - vfloat32m1_t _S21 = vle32_v_f32m1(S2p, vl); - vfloat32m1_t _S22 = vle32_v_f32m1(S2p + packn, vl); - vfloat32m1_t _S23 = vle32_v_f32m1(S2p + packn * 2, vl); - vfloat32m1_t _S30 = vle32_v_f32m1(S3p - packn, vl); - vfloat32m1_t _S31 = vle32_v_f32m1(S3p, vl); - vfloat32m1_t _S32 = vle32_v_f32m1(S3p + packn, vl); - vfloat32m1_t _S33 = vle32_v_f32m1(S3p + packn * 2, vl); - vfloat32m1_t _rows2 = vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmul_vf_f32m1(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); - vfloat32m1_t _rows3 = vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmul_vf_f32m1(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + vfloat32m1_t _S20 = __riscv_vle32_v_f32m1(S2p - packn, vl); + vfloat32m1_t _S21 = __riscv_vle32_v_f32m1(S2p, vl); + vfloat32m1_t _S22 = __riscv_vle32_v_f32m1(S2p + packn, vl); + vfloat32m1_t _S23 = __riscv_vle32_v_f32m1(S2p + packn * 2, vl); + vfloat32m1_t _S30 = __riscv_vle32_v_f32m1(S3p - packn, vl); + vfloat32m1_t _S31 = __riscv_vle32_v_f32m1(S3p, vl); + vfloat32m1_t _S32 = __riscv_vle32_v_f32m1(S3p + packn, vl); + vfloat32m1_t _S33 = __riscv_vle32_v_f32m1(S3p + packn * 2, vl); + 
vfloat32m1_t _rows2 = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); + vfloat32m1_t _rows3 = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); - vse32_v_f32m1(rows2p + dx * packn, _rows2, vl); - vse32_v_f32m1(rows3p + dx * packn, _rows3, vl); + __riscv_vse32_v_f32m1(rows2p + dx * packn, _rows2, vl); + __riscv_vse32_v_f32m1(rows3p + dx * packn, _rows3, vl); alphap += 4; } @@ -131,25 +131,25 @@ static void resize_bicubic_image_packn(const Mat& src, Mat& dst, float* alpha, i const float* S2p = S2 + sx; const float* S3p = S3 + sx; - vfloat32m1_t _S10 = vle32_v_f32m1(S1p - packn, vl); - vfloat32m1_t _S11 = vle32_v_f32m1(S1p, vl); - vfloat32m1_t _S12 = vle32_v_f32m1(S1p + packn, vl); - vfloat32m1_t _S13 = vle32_v_f32m1(S1p + packn * 2, vl); - vfloat32m1_t _S20 = vle32_v_f32m1(S2p - packn, vl); - vfloat32m1_t _S21 = vle32_v_f32m1(S2p, vl); - vfloat32m1_t _S22 = vle32_v_f32m1(S2p + packn, vl); - vfloat32m1_t _S23 = vle32_v_f32m1(S2p + packn * 2, vl); - vfloat32m1_t _S30 = vle32_v_f32m1(S3p - packn, vl); - vfloat32m1_t _S31 = vle32_v_f32m1(S3p, vl); - vfloat32m1_t _S32 = vle32_v_f32m1(S3p + packn, vl); - vfloat32m1_t _S33 = vle32_v_f32m1(S3p + packn * 2, vl); - vfloat32m1_t _rows1 = vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmul_vf_f32m1(_S10, alphap[0], vl), alphap[1], _S11, vl), alphap[2], _S12, vl), alphap[3], _S13, vl); - vfloat32m1_t _rows2 = vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmul_vf_f32m1(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); - vfloat32m1_t _rows3 = vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmul_vf_f32m1(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); - - vse32_v_f32m1(rows1p + dx * packn, _rows1, vl); - vse32_v_f32m1(rows2p + dx * packn, _rows2, vl); - vse32_v_f32m1(rows3p + dx * packn, _rows3, vl); + vfloat32m1_t _S10 = __riscv_vle32_v_f32m1(S1p - packn, vl); + vfloat32m1_t _S11 = __riscv_vle32_v_f32m1(S1p, vl); + vfloat32m1_t _S12 = __riscv_vle32_v_f32m1(S1p + packn, vl); + vfloat32m1_t _S13 = __riscv_vle32_v_f32m1(S1p + packn * 2, vl); + vfloat32m1_t _S20 = __riscv_vle32_v_f32m1(S2p - packn, vl); + vfloat32m1_t _S21 = __riscv_vle32_v_f32m1(S2p, vl); + vfloat32m1_t _S22 = __riscv_vle32_v_f32m1(S2p + packn, vl); + vfloat32m1_t _S23 = __riscv_vle32_v_f32m1(S2p + packn * 2, vl); + vfloat32m1_t _S30 = __riscv_vle32_v_f32m1(S3p - packn, vl); + vfloat32m1_t _S31 = __riscv_vle32_v_f32m1(S3p, vl); + vfloat32m1_t _S32 = __riscv_vle32_v_f32m1(S3p + packn, vl); + vfloat32m1_t _S33 = __riscv_vle32_v_f32m1(S3p + packn * 2, vl); + vfloat32m1_t _rows1 = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_S10, alphap[0], vl), alphap[1], _S11, vl), alphap[2], _S12, vl), alphap[3], _S13, vl); + vfloat32m1_t _rows2 = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); + vfloat32m1_t _rows3 = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + + __riscv_vse32_v_f32m1(rows1p + dx * packn, _rows1, vl); + 
__riscv_vse32_v_f32m1(rows2p + dx * packn, _rows2, vl); + __riscv_vse32_v_f32m1(rows3p + dx * packn, _rows3, vl); alphap += 4; } @@ -175,31 +175,31 @@ static void resize_bicubic_image_packn(const Mat& src, Mat& dst, float* alpha, i const float* S2p = S2 + sx; const float* S3p = S3 + sx; - vfloat32m1_t _S00 = vle32_v_f32m1(S0p - packn, vl); - vfloat32m1_t _S01 = vle32_v_f32m1(S0p, vl); - vfloat32m1_t _S02 = vle32_v_f32m1(S0p + packn, vl); - vfloat32m1_t _S03 = vle32_v_f32m1(S0p + packn * 2, vl); - vfloat32m1_t _S10 = vle32_v_f32m1(S1p - packn, vl); - vfloat32m1_t _S11 = vle32_v_f32m1(S1p, vl); - vfloat32m1_t _S12 = vle32_v_f32m1(S1p + packn, vl); - vfloat32m1_t _S13 = vle32_v_f32m1(S1p + packn * 2, vl); - vfloat32m1_t _S20 = vle32_v_f32m1(S2p - packn, vl); - vfloat32m1_t _S21 = vle32_v_f32m1(S2p, vl); - vfloat32m1_t _S22 = vle32_v_f32m1(S2p + packn, vl); - vfloat32m1_t _S23 = vle32_v_f32m1(S2p + packn * 2, vl); - vfloat32m1_t _S30 = vle32_v_f32m1(S3p - packn, vl); - vfloat32m1_t _S31 = vle32_v_f32m1(S3p, vl); - vfloat32m1_t _S32 = vle32_v_f32m1(S3p + packn, vl); - vfloat32m1_t _S33 = vle32_v_f32m1(S3p + packn * 2, vl); - vfloat32m1_t _rows0 = vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmul_vf_f32m1(_S00, alphap[0], vl), alphap[1], _S01, vl), alphap[2], _S02, vl), alphap[3], _S03, vl); - vfloat32m1_t _rows1 = vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmul_vf_f32m1(_S10, alphap[0], vl), alphap[1], _S11, vl), alphap[2], _S12, vl), alphap[3], _S13, vl); - vfloat32m1_t _rows2 = vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmul_vf_f32m1(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); - vfloat32m1_t _rows3 = vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmul_vf_f32m1(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); - - vse32_v_f32m1(rows0p + dx * packn, _rows0, vl); - vse32_v_f32m1(rows1p + dx * packn, _rows1, vl); - vse32_v_f32m1(rows2p + dx * packn, _rows2, vl); - vse32_v_f32m1(rows3p + dx * packn, _rows3, vl); + vfloat32m1_t _S00 = __riscv_vle32_v_f32m1(S0p - packn, vl); + vfloat32m1_t _S01 = __riscv_vle32_v_f32m1(S0p, vl); + vfloat32m1_t _S02 = __riscv_vle32_v_f32m1(S0p + packn, vl); + vfloat32m1_t _S03 = __riscv_vle32_v_f32m1(S0p + packn * 2, vl); + vfloat32m1_t _S10 = __riscv_vle32_v_f32m1(S1p - packn, vl); + vfloat32m1_t _S11 = __riscv_vle32_v_f32m1(S1p, vl); + vfloat32m1_t _S12 = __riscv_vle32_v_f32m1(S1p + packn, vl); + vfloat32m1_t _S13 = __riscv_vle32_v_f32m1(S1p + packn * 2, vl); + vfloat32m1_t _S20 = __riscv_vle32_v_f32m1(S2p - packn, vl); + vfloat32m1_t _S21 = __riscv_vle32_v_f32m1(S2p, vl); + vfloat32m1_t _S22 = __riscv_vle32_v_f32m1(S2p + packn, vl); + vfloat32m1_t _S23 = __riscv_vle32_v_f32m1(S2p + packn * 2, vl); + vfloat32m1_t _S30 = __riscv_vle32_v_f32m1(S3p - packn, vl); + vfloat32m1_t _S31 = __riscv_vle32_v_f32m1(S3p, vl); + vfloat32m1_t _S32 = __riscv_vle32_v_f32m1(S3p + packn, vl); + vfloat32m1_t _S33 = __riscv_vle32_v_f32m1(S3p + packn * 2, vl); + vfloat32m1_t _rows0 = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_S00, alphap[0], vl), alphap[1], _S01, vl), alphap[2], _S02, vl), alphap[3], _S03, vl); + vfloat32m1_t _rows1 = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_S10, alphap[0], vl), alphap[1], _S11, vl), alphap[2], _S12, vl), alphap[3], _S13, vl); + vfloat32m1_t _rows2 = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_S20, 
alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); + vfloat32m1_t _rows3 = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + + __riscv_vse32_v_f32m1(rows0p + dx * packn, _rows0, vl); + __riscv_vse32_v_f32m1(rows1p + dx * packn, _rows1, vl); + __riscv_vse32_v_f32m1(rows2p + dx * packn, _rows2, vl); + __riscv_vse32_v_f32m1(rows3p + dx * packn, _rows3, vl); alphap += 4; } @@ -221,14 +221,14 @@ static void resize_bicubic_image_packn(const Mat& src, Mat& dst, float* alpha, i for (int dx = 0; dx < w; dx++) { - vfloat32m1_t _rows0 = vle32_v_f32m1(rows0p, vl); - vfloat32m1_t _rows1 = vle32_v_f32m1(rows1p, vl); - vfloat32m1_t _rows2 = vle32_v_f32m1(rows2p, vl); - vfloat32m1_t _rows3 = vle32_v_f32m1(rows3p, vl); + vfloat32m1_t _rows0 = __riscv_vle32_v_f32m1(rows0p, vl); + vfloat32m1_t _rows1 = __riscv_vle32_v_f32m1(rows1p, vl); + vfloat32m1_t _rows2 = __riscv_vle32_v_f32m1(rows2p, vl); + vfloat32m1_t _rows3 = __riscv_vle32_v_f32m1(rows3p, vl); - vfloat32m1_t _Dp = vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmul_vf_f32m1(_rows0, b0, vl), b1, _rows1, vl), b2, _rows2, vl), b3, _rows3, vl); + vfloat32m1_t _Dp = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_rows0, b0, vl), b1, _rows1, vl), b2, _rows2, vl), b3, _rows3, vl); - vse32_v_f32m1(Dp, _Dp, vl); + __riscv_vse32_v_f32m1(Dp, _Dp, vl); Dp += packn; rows0p += packn; diff --git a/src/layer/riscv/interp_bicubic_packn_fp16s.h b/src/layer/riscv/interp_bicubic_packn_fp16s.h index f87d5bb5a4c..7e9c1dc0235 100644 --- a/src/layer/riscv/interp_bicubic_packn_fp16s.h +++ b/src/layer/riscv/interp_bicubic_packn_fp16s.h @@ -15,7 +15,7 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; @@ -57,13 +57,13 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* al int sx = xofs[dx] * packn; const __fp16* S3p = S3 + sx; - vfloat16m1_t _S30 = vle16_v_f16m1(S3p - packn, vl); - vfloat16m1_t _S31 = vle16_v_f16m1(S3p, vl); - vfloat16m1_t _S32 = vle16_v_f16m1(S3p + packn, vl); - vfloat16m1_t _S33 = vle16_v_f16m1(S3p + packn * 2, vl); - vfloat32m2_t _rows3 = vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmul_vf_f32m2(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + vfloat16m1_t _S30 = __riscv_vle16_v_f16m1(S3p - packn, vl); + vfloat16m1_t _S31 = __riscv_vle16_v_f16m1(S3p, vl); + vfloat16m1_t _S32 = __riscv_vle16_v_f16m1(S3p + packn, vl); + vfloat16m1_t _S33 = __riscv_vle16_v_f16m1(S3p + packn * 2, vl); + vfloat32m2_t _rows3 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); - vse32_v_f32m2(rows3p + dx * packn, _rows3, vl); + __riscv_vse32_v_f32m2(rows3p + dx * packn, _rows3, vl); alphap += 4; } @@ -89,19 +89,19 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* al const __fp16* S2p = S2 + sx; const __fp16* S3p = S3 + sx; - vfloat16m1_t _S20 = vle16_v_f16m1(S2p - packn, vl); - vfloat16m1_t _S21 = vle16_v_f16m1(S2p, vl); - vfloat16m1_t _S22 = vle16_v_f16m1(S2p + packn, vl); - vfloat16m1_t _S23 = 
vle16_v_f16m1(S2p + packn * 2, vl); - vfloat16m1_t _S30 = vle16_v_f16m1(S3p - packn, vl); - vfloat16m1_t _S31 = vle16_v_f16m1(S3p, vl); - vfloat16m1_t _S32 = vle16_v_f16m1(S3p + packn, vl); - vfloat16m1_t _S33 = vle16_v_f16m1(S3p + packn * 2, vl); - vfloat32m2_t _rows2 = vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmul_vf_f32m2(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); - vfloat32m2_t _rows3 = vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmul_vf_f32m2(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + vfloat16m1_t _S20 = __riscv_vle16_v_f16m1(S2p - packn, vl); + vfloat16m1_t _S21 = __riscv_vle16_v_f16m1(S2p, vl); + vfloat16m1_t _S22 = __riscv_vle16_v_f16m1(S2p + packn, vl); + vfloat16m1_t _S23 = __riscv_vle16_v_f16m1(S2p + packn * 2, vl); + vfloat16m1_t _S30 = __riscv_vle16_v_f16m1(S3p - packn, vl); + vfloat16m1_t _S31 = __riscv_vle16_v_f16m1(S3p, vl); + vfloat16m1_t _S32 = __riscv_vle16_v_f16m1(S3p + packn, vl); + vfloat16m1_t _S33 = __riscv_vle16_v_f16m1(S3p + packn * 2, vl); + vfloat32m2_t _rows2 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); + vfloat32m2_t _rows3 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); - vse32_v_f32m2(rows2p + dx * packn, _rows2, vl); - vse32_v_f32m2(rows3p + dx * packn, _rows3, vl); + __riscv_vse32_v_f32m2(rows2p + dx * packn, _rows2, vl); + __riscv_vse32_v_f32m2(rows3p + dx * packn, _rows3, vl); alphap += 4; } @@ -131,25 +131,25 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* al const __fp16* S2p = S2 + sx; const __fp16* S3p = S3 + sx; - vfloat16m1_t _S10 = vle16_v_f16m1(S1p - packn, vl); - vfloat16m1_t _S11 = vle16_v_f16m1(S1p, vl); - vfloat16m1_t _S12 = vle16_v_f16m1(S1p + packn, vl); - vfloat16m1_t _S13 = vle16_v_f16m1(S1p + packn * 2, vl); - vfloat16m1_t _S20 = vle16_v_f16m1(S2p - packn, vl); - vfloat16m1_t _S21 = vle16_v_f16m1(S2p, vl); - vfloat16m1_t _S22 = vle16_v_f16m1(S2p + packn, vl); - vfloat16m1_t _S23 = vle16_v_f16m1(S2p + packn * 2, vl); - vfloat16m1_t _S30 = vle16_v_f16m1(S3p - packn, vl); - vfloat16m1_t _S31 = vle16_v_f16m1(S3p, vl); - vfloat16m1_t _S32 = vle16_v_f16m1(S3p + packn, vl); - vfloat16m1_t _S33 = vle16_v_f16m1(S3p + packn * 2, vl); - vfloat32m2_t _rows1 = vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmul_vf_f32m2(_S10, alphap[0], vl), alphap[1], _S11, vl), alphap[2], _S12, vl), alphap[3], _S13, vl); - vfloat32m2_t _rows2 = vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmul_vf_f32m2(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); - vfloat32m2_t _rows3 = vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmul_vf_f32m2(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); - - vse32_v_f32m2(rows1p + dx * packn, _rows1, vl); - vse32_v_f32m2(rows2p + dx * packn, _rows2, vl); - vse32_v_f32m2(rows3p + dx * packn, _rows3, vl); + vfloat16m1_t _S10 = __riscv_vle16_v_f16m1(S1p - packn, vl); + vfloat16m1_t _S11 = __riscv_vle16_v_f16m1(S1p, vl); + vfloat16m1_t _S12 = __riscv_vle16_v_f16m1(S1p + packn, vl); + vfloat16m1_t _S13 = __riscv_vle16_v_f16m1(S1p + packn * 2, vl); + vfloat16m1_t _S20 = __riscv_vle16_v_f16m1(S2p - packn, vl); + vfloat16m1_t 
_S21 = __riscv_vle16_v_f16m1(S2p, vl); + vfloat16m1_t _S22 = __riscv_vle16_v_f16m1(S2p + packn, vl); + vfloat16m1_t _S23 = __riscv_vle16_v_f16m1(S2p + packn * 2, vl); + vfloat16m1_t _S30 = __riscv_vle16_v_f16m1(S3p - packn, vl); + vfloat16m1_t _S31 = __riscv_vle16_v_f16m1(S3p, vl); + vfloat16m1_t _S32 = __riscv_vle16_v_f16m1(S3p + packn, vl); + vfloat16m1_t _S33 = __riscv_vle16_v_f16m1(S3p + packn * 2, vl); + vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, alphap[0], vl), alphap[1], _S11, vl), alphap[2], _S12, vl), alphap[3], _S13, vl); + vfloat32m2_t _rows2 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); + vfloat32m2_t _rows3 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + + __riscv_vse32_v_f32m2(rows1p + dx * packn, _rows1, vl); + __riscv_vse32_v_f32m2(rows2p + dx * packn, _rows2, vl); + __riscv_vse32_v_f32m2(rows3p + dx * packn, _rows3, vl); alphap += 4; } @@ -175,31 +175,31 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* al const __fp16* S2p = S2 + sx; const __fp16* S3p = S3 + sx; - vfloat16m1_t _S00 = vle16_v_f16m1(S0p - packn, vl); - vfloat16m1_t _S01 = vle16_v_f16m1(S0p, vl); - vfloat16m1_t _S02 = vle16_v_f16m1(S0p + packn, vl); - vfloat16m1_t _S03 = vle16_v_f16m1(S0p + packn * 2, vl); - vfloat16m1_t _S10 = vle16_v_f16m1(S1p - packn, vl); - vfloat16m1_t _S11 = vle16_v_f16m1(S1p, vl); - vfloat16m1_t _S12 = vle16_v_f16m1(S1p + packn, vl); - vfloat16m1_t _S13 = vle16_v_f16m1(S1p + packn * 2, vl); - vfloat16m1_t _S20 = vle16_v_f16m1(S2p - packn, vl); - vfloat16m1_t _S21 = vle16_v_f16m1(S2p, vl); - vfloat16m1_t _S22 = vle16_v_f16m1(S2p + packn, vl); - vfloat16m1_t _S23 = vle16_v_f16m1(S2p + packn * 2, vl); - vfloat16m1_t _S30 = vle16_v_f16m1(S3p - packn, vl); - vfloat16m1_t _S31 = vle16_v_f16m1(S3p, vl); - vfloat16m1_t _S32 = vle16_v_f16m1(S3p + packn, vl); - vfloat16m1_t _S33 = vle16_v_f16m1(S3p + packn * 2, vl); - vfloat32m2_t _rows0 = vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmul_vf_f32m2(_S00, alphap[0], vl), alphap[1], _S01, vl), alphap[2], _S02, vl), alphap[3], _S03, vl); - vfloat32m2_t _rows1 = vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmul_vf_f32m2(_S10, alphap[0], vl), alphap[1], _S11, vl), alphap[2], _S12, vl), alphap[3], _S13, vl); - vfloat32m2_t _rows2 = vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmul_vf_f32m2(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); - vfloat32m2_t _rows3 = vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmul_vf_f32m2(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); - - vse32_v_f32m2(rows0p + dx * packn, _rows0, vl); - vse32_v_f32m2(rows1p + dx * packn, _rows1, vl); - vse32_v_f32m2(rows2p + dx * packn, _rows2, vl); - vse32_v_f32m2(rows3p + dx * packn, _rows3, vl); + vfloat16m1_t _S00 = __riscv_vle16_v_f16m1(S0p - packn, vl); + vfloat16m1_t _S01 = __riscv_vle16_v_f16m1(S0p, vl); + vfloat16m1_t _S02 = __riscv_vle16_v_f16m1(S0p + packn, vl); + vfloat16m1_t _S03 = __riscv_vle16_v_f16m1(S0p + packn * 2, vl); + vfloat16m1_t _S10 = __riscv_vle16_v_f16m1(S1p - packn, vl); + vfloat16m1_t _S11 = __riscv_vle16_v_f16m1(S1p, vl); + vfloat16m1_t _S12 = 
__riscv_vle16_v_f16m1(S1p + packn, vl); + vfloat16m1_t _S13 = __riscv_vle16_v_f16m1(S1p + packn * 2, vl); + vfloat16m1_t _S20 = __riscv_vle16_v_f16m1(S2p - packn, vl); + vfloat16m1_t _S21 = __riscv_vle16_v_f16m1(S2p, vl); + vfloat16m1_t _S22 = __riscv_vle16_v_f16m1(S2p + packn, vl); + vfloat16m1_t _S23 = __riscv_vle16_v_f16m1(S2p + packn * 2, vl); + vfloat16m1_t _S30 = __riscv_vle16_v_f16m1(S3p - packn, vl); + vfloat16m1_t _S31 = __riscv_vle16_v_f16m1(S3p, vl); + vfloat16m1_t _S32 = __riscv_vle16_v_f16m1(S3p + packn, vl); + vfloat16m1_t _S33 = __riscv_vle16_v_f16m1(S3p + packn * 2, vl); + vfloat32m2_t _rows0 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S00, alphap[0], vl), alphap[1], _S01, vl), alphap[2], _S02, vl), alphap[3], _S03, vl); + vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, alphap[0], vl), alphap[1], _S11, vl), alphap[2], _S12, vl), alphap[3], _S13, vl); + vfloat32m2_t _rows2 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); + vfloat32m2_t _rows3 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + + __riscv_vse32_v_f32m2(rows0p + dx * packn, _rows0, vl); + __riscv_vse32_v_f32m2(rows1p + dx * packn, _rows1, vl); + __riscv_vse32_v_f32m2(rows2p + dx * packn, _rows2, vl); + __riscv_vse32_v_f32m2(rows3p + dx * packn, _rows3, vl); alphap += 4; } @@ -221,14 +221,14 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* al for (int dx = 0; dx < w; dx++) { - vfloat32m2_t _rows0 = vle32_v_f32m2(rows0p, vl); - vfloat32m2_t _rows1 = vle32_v_f32m2(rows1p, vl); - vfloat32m2_t _rows2 = vle32_v_f32m2(rows2p, vl); - vfloat32m2_t _rows3 = vle32_v_f32m2(rows3p, vl); + vfloat32m2_t _rows0 = __riscv_vle32_v_f32m2(rows0p, vl); + vfloat32m2_t _rows1 = __riscv_vle32_v_f32m2(rows1p, vl); + vfloat32m2_t _rows2 = __riscv_vle32_v_f32m2(rows2p, vl); + vfloat32m2_t _rows3 = __riscv_vle32_v_f32m2(rows3p, vl); - vfloat32m2_t _Dp = vfmacc_vf_f32m2(vfmacc_vf_f32m2(vfmacc_vf_f32m2(vfmul_vf_f32m2(_rows0, b0, vl), b1, _rows1, vl), b2, _rows2, vl), b3, _rows3, vl); + vfloat32m2_t _Dp = __riscv_vfmacc_vf_f32m2(__riscv_vfmacc_vf_f32m2(__riscv_vfmacc_vf_f32m2(__riscv_vfmul_vf_f32m2(_rows0, b0, vl), b1, _rows1, vl), b2, _rows2, vl), b3, _rows3, vl); - vse16_v_f16m1(Dp, vfncvt_f_f_w_f16m1(_Dp, vl), vl); + __riscv_vse16_v_f16m1(Dp, __riscv_vfncvt_f_f_w_f16m1(_Dp, vl), vl); Dp += packn; rows0p += packn; @@ -244,7 +244,7 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* al static void resize_bicubic_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* alpha, int* xofs, __fp16* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; @@ -286,13 +286,13 @@ static void resize_bicubic_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* int sx = xofs[dx] * packn; const __fp16* S3p = S3 + sx; - vfloat16m1_t _S30 = vle16_v_f16m1(S3p - packn, vl); - vfloat16m1_t _S31 = vle16_v_f16m1(S3p, vl); - vfloat16m1_t _S32 = vle16_v_f16m1(S3p + packn, vl); - vfloat16m1_t _S33 = vle16_v_f16m1(S3p + packn * 2, vl); - vfloat16m1_t _rows3 = 
vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmul_vf_f16m1(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + vfloat16m1_t _S30 = __riscv_vle16_v_f16m1(S3p - packn, vl); + vfloat16m1_t _S31 = __riscv_vle16_v_f16m1(S3p, vl); + vfloat16m1_t _S32 = __riscv_vle16_v_f16m1(S3p + packn, vl); + vfloat16m1_t _S33 = __riscv_vle16_v_f16m1(S3p + packn * 2, vl); + vfloat16m1_t _rows3 = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); - vse16_v_f16m1(rows3p + dx * packn, _rows3, vl); + __riscv_vse16_v_f16m1(rows3p + dx * packn, _rows3, vl); alphap += 4; } @@ -318,19 +318,19 @@ static void resize_bicubic_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* const __fp16* S2p = S2 + sx; const __fp16* S3p = S3 + sx; - vfloat16m1_t _S20 = vle16_v_f16m1(S2p - packn, vl); - vfloat16m1_t _S21 = vle16_v_f16m1(S2p, vl); - vfloat16m1_t _S22 = vle16_v_f16m1(S2p + packn, vl); - vfloat16m1_t _S23 = vle16_v_f16m1(S2p + packn * 2, vl); - vfloat16m1_t _S30 = vle16_v_f16m1(S3p - packn, vl); - vfloat16m1_t _S31 = vle16_v_f16m1(S3p, vl); - vfloat16m1_t _S32 = vle16_v_f16m1(S3p + packn, vl); - vfloat16m1_t _S33 = vle16_v_f16m1(S3p + packn * 2, vl); - vfloat16m1_t _rows2 = vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmul_vf_f16m1(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); - vfloat16m1_t _rows3 = vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmul_vf_f16m1(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + vfloat16m1_t _S20 = __riscv_vle16_v_f16m1(S2p - packn, vl); + vfloat16m1_t _S21 = __riscv_vle16_v_f16m1(S2p, vl); + vfloat16m1_t _S22 = __riscv_vle16_v_f16m1(S2p + packn, vl); + vfloat16m1_t _S23 = __riscv_vle16_v_f16m1(S2p + packn * 2, vl); + vfloat16m1_t _S30 = __riscv_vle16_v_f16m1(S3p - packn, vl); + vfloat16m1_t _S31 = __riscv_vle16_v_f16m1(S3p, vl); + vfloat16m1_t _S32 = __riscv_vle16_v_f16m1(S3p + packn, vl); + vfloat16m1_t _S33 = __riscv_vle16_v_f16m1(S3p + packn * 2, vl); + vfloat16m1_t _rows2 = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); + vfloat16m1_t _rows3 = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); - vse16_v_f16m1(rows2p + dx * packn, _rows2, vl); - vse16_v_f16m1(rows3p + dx * packn, _rows3, vl); + __riscv_vse16_v_f16m1(rows2p + dx * packn, _rows2, vl); + __riscv_vse16_v_f16m1(rows3p + dx * packn, _rows3, vl); alphap += 4; } @@ -360,25 +360,25 @@ static void resize_bicubic_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* const __fp16* S2p = S2 + sx; const __fp16* S3p = S3 + sx; - vfloat16m1_t _S10 = vle16_v_f16m1(S1p - packn, vl); - vfloat16m1_t _S11 = vle16_v_f16m1(S1p, vl); - vfloat16m1_t _S12 = vle16_v_f16m1(S1p + packn, vl); - vfloat16m1_t _S13 = vle16_v_f16m1(S1p + packn * 2, vl); - vfloat16m1_t _S20 = vle16_v_f16m1(S2p - packn, vl); - vfloat16m1_t _S21 = vle16_v_f16m1(S2p, vl); - vfloat16m1_t _S22 = vle16_v_f16m1(S2p + packn, vl); - vfloat16m1_t _S23 = vle16_v_f16m1(S2p + packn * 2, vl); - vfloat16m1_t _S30 = vle16_v_f16m1(S3p - packn, vl); - vfloat16m1_t _S31 = vle16_v_f16m1(S3p, vl); - vfloat16m1_t _S32 = vle16_v_f16m1(S3p + packn, vl); - vfloat16m1_t _S33 = 
vle16_v_f16m1(S3p + packn * 2, vl); - vfloat16m1_t _rows1 = vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmul_vf_f16m1(_S10, alphap[0], vl), alphap[1], _S11, vl), alphap[2], _S12, vl), alphap[3], _S13, vl); - vfloat16m1_t _rows2 = vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmul_vf_f16m1(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); - vfloat16m1_t _rows3 = vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmul_vf_f16m1(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); - - vse16_v_f16m1(rows1p + dx * packn, _rows1, vl); - vse16_v_f16m1(rows2p + dx * packn, _rows2, vl); - vse16_v_f16m1(rows3p + dx * packn, _rows3, vl); + vfloat16m1_t _S10 = __riscv_vle16_v_f16m1(S1p - packn, vl); + vfloat16m1_t _S11 = __riscv_vle16_v_f16m1(S1p, vl); + vfloat16m1_t _S12 = __riscv_vle16_v_f16m1(S1p + packn, vl); + vfloat16m1_t _S13 = __riscv_vle16_v_f16m1(S1p + packn * 2, vl); + vfloat16m1_t _S20 = __riscv_vle16_v_f16m1(S2p - packn, vl); + vfloat16m1_t _S21 = __riscv_vle16_v_f16m1(S2p, vl); + vfloat16m1_t _S22 = __riscv_vle16_v_f16m1(S2p + packn, vl); + vfloat16m1_t _S23 = __riscv_vle16_v_f16m1(S2p + packn * 2, vl); + vfloat16m1_t _S30 = __riscv_vle16_v_f16m1(S3p - packn, vl); + vfloat16m1_t _S31 = __riscv_vle16_v_f16m1(S3p, vl); + vfloat16m1_t _S32 = __riscv_vle16_v_f16m1(S3p + packn, vl); + vfloat16m1_t _S33 = __riscv_vle16_v_f16m1(S3p + packn * 2, vl); + vfloat16m1_t _rows1 = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S10, alphap[0], vl), alphap[1], _S11, vl), alphap[2], _S12, vl), alphap[3], _S13, vl); + vfloat16m1_t _rows2 = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); + vfloat16m1_t _rows3 = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + + __riscv_vse16_v_f16m1(rows1p + dx * packn, _rows1, vl); + __riscv_vse16_v_f16m1(rows2p + dx * packn, _rows2, vl); + __riscv_vse16_v_f16m1(rows3p + dx * packn, _rows3, vl); alphap += 4; } @@ -404,31 +404,31 @@ static void resize_bicubic_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* const __fp16* S2p = S2 + sx; const __fp16* S3p = S3 + sx; - vfloat16m1_t _S00 = vle16_v_f16m1(S0p - packn, vl); - vfloat16m1_t _S01 = vle16_v_f16m1(S0p, vl); - vfloat16m1_t _S02 = vle16_v_f16m1(S0p + packn, vl); - vfloat16m1_t _S03 = vle16_v_f16m1(S0p + packn * 2, vl); - vfloat16m1_t _S10 = vle16_v_f16m1(S1p - packn, vl); - vfloat16m1_t _S11 = vle16_v_f16m1(S1p, vl); - vfloat16m1_t _S12 = vle16_v_f16m1(S1p + packn, vl); - vfloat16m1_t _S13 = vle16_v_f16m1(S1p + packn * 2, vl); - vfloat16m1_t _S20 = vle16_v_f16m1(S2p - packn, vl); - vfloat16m1_t _S21 = vle16_v_f16m1(S2p, vl); - vfloat16m1_t _S22 = vle16_v_f16m1(S2p + packn, vl); - vfloat16m1_t _S23 = vle16_v_f16m1(S2p + packn * 2, vl); - vfloat16m1_t _S30 = vle16_v_f16m1(S3p - packn, vl); - vfloat16m1_t _S31 = vle16_v_f16m1(S3p, vl); - vfloat16m1_t _S32 = vle16_v_f16m1(S3p + packn, vl); - vfloat16m1_t _S33 = vle16_v_f16m1(S3p + packn * 2, vl); - vfloat16m1_t _rows0 = vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmul_vf_f16m1(_S00, alphap[0], vl), alphap[1], _S01, vl), alphap[2], _S02, vl), alphap[3], _S03, vl); - vfloat16m1_t _rows1 = vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmul_vf_f16m1(_S10, alphap[0], vl), 
alphap[1], _S11, vl), alphap[2], _S12, vl), alphap[3], _S13, vl); - vfloat16m1_t _rows2 = vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmul_vf_f16m1(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); - vfloat16m1_t _rows3 = vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmul_vf_f16m1(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); - - vse16_v_f16m1(rows0p + dx * packn, _rows0, vl); - vse16_v_f16m1(rows1p + dx * packn, _rows1, vl); - vse16_v_f16m1(rows2p + dx * packn, _rows2, vl); - vse16_v_f16m1(rows3p + dx * packn, _rows3, vl); + vfloat16m1_t _S00 = __riscv_vle16_v_f16m1(S0p - packn, vl); + vfloat16m1_t _S01 = __riscv_vle16_v_f16m1(S0p, vl); + vfloat16m1_t _S02 = __riscv_vle16_v_f16m1(S0p + packn, vl); + vfloat16m1_t _S03 = __riscv_vle16_v_f16m1(S0p + packn * 2, vl); + vfloat16m1_t _S10 = __riscv_vle16_v_f16m1(S1p - packn, vl); + vfloat16m1_t _S11 = __riscv_vle16_v_f16m1(S1p, vl); + vfloat16m1_t _S12 = __riscv_vle16_v_f16m1(S1p + packn, vl); + vfloat16m1_t _S13 = __riscv_vle16_v_f16m1(S1p + packn * 2, vl); + vfloat16m1_t _S20 = __riscv_vle16_v_f16m1(S2p - packn, vl); + vfloat16m1_t _S21 = __riscv_vle16_v_f16m1(S2p, vl); + vfloat16m1_t _S22 = __riscv_vle16_v_f16m1(S2p + packn, vl); + vfloat16m1_t _S23 = __riscv_vle16_v_f16m1(S2p + packn * 2, vl); + vfloat16m1_t _S30 = __riscv_vle16_v_f16m1(S3p - packn, vl); + vfloat16m1_t _S31 = __riscv_vle16_v_f16m1(S3p, vl); + vfloat16m1_t _S32 = __riscv_vle16_v_f16m1(S3p + packn, vl); + vfloat16m1_t _S33 = __riscv_vle16_v_f16m1(S3p + packn * 2, vl); + vfloat16m1_t _rows0 = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S00, alphap[0], vl), alphap[1], _S01, vl), alphap[2], _S02, vl), alphap[3], _S03, vl); + vfloat16m1_t _rows1 = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S10, alphap[0], vl), alphap[1], _S11, vl), alphap[2], _S12, vl), alphap[3], _S13, vl); + vfloat16m1_t _rows2 = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S20, alphap[0], vl), alphap[1], _S21, vl), alphap[2], _S22, vl), alphap[3], _S23, vl); + vfloat16m1_t _rows3 = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S30, alphap[0], vl), alphap[1], _S31, vl), alphap[2], _S32, vl), alphap[3], _S33, vl); + + __riscv_vse16_v_f16m1(rows0p + dx * packn, _rows0, vl); + __riscv_vse16_v_f16m1(rows1p + dx * packn, _rows1, vl); + __riscv_vse16_v_f16m1(rows2p + dx * packn, _rows2, vl); + __riscv_vse16_v_f16m1(rows3p + dx * packn, _rows3, vl); alphap += 4; } @@ -450,14 +450,14 @@ static void resize_bicubic_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* for (int dx = 0; dx < w; dx++) { - vfloat16m1_t _rows0 = vle16_v_f16m1(rows0p, vl); - vfloat16m1_t _rows1 = vle16_v_f16m1(rows1p, vl); - vfloat16m1_t _rows2 = vle16_v_f16m1(rows2p, vl); - vfloat16m1_t _rows3 = vle16_v_f16m1(rows3p, vl); + vfloat16m1_t _rows0 = __riscv_vle16_v_f16m1(rows0p, vl); + vfloat16m1_t _rows1 = __riscv_vle16_v_f16m1(rows1p, vl); + vfloat16m1_t _rows2 = __riscv_vle16_v_f16m1(rows2p, vl); + vfloat16m1_t _rows3 = __riscv_vle16_v_f16m1(rows3p, vl); - vfloat16m1_t _Dp = vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmul_vf_f16m1(_rows0, b0, vl), b1, _rows1, vl), b2, _rows2, vl), b3, _rows3, vl); + vfloat16m1_t _Dp = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_rows0, b0, vl), b1, _rows1, vl), b2, 
_rows2, vl), b3, _rows3, vl); - vse16_v_f16m1(Dp, _Dp, vl); + __riscv_vse16_v_f16m1(Dp, _Dp, vl); Dp += packn; rows0p += packn; diff --git a/src/layer/riscv/interp_bilinear.h b/src/layer/riscv/interp_bilinear.h index ffd613a6573..03ae612a02e 100644 --- a/src/layer/riscv/interp_bilinear.h +++ b/src/layer/riscv/interp_bilinear.h @@ -86,21 +86,21 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x int n = w; while (n > 0) { - size_t vl = vsetvl_e32m4(n); + size_t vl = __riscv_vsetvl_e32m4(n); - vuint32m4_t _sx = vmul_vx_u32m4(vle32_v_u32m4(pxofs, vl), sizeof(float), vl); + vuint32m4_t _sx = __riscv_vmul_vx_u32m4(__riscv_vle32_v_u32m4(pxofs, vl), sizeof(float), vl); - vfloat32m4_t _S1p0; - vfloat32m4_t _S1p1; - vloxseg2ei32_v_f32m4(&_S1p0, &_S1p1, S1, _sx, vl); + vfloat32m4x2_t _S1 = __riscv_vloxseg2ei32_v_f32m4x2(S1, _sx, vl); + vfloat32m4_t _S1p0 = __riscv_vget_v_f32m4x2_f32m4(_S1, 0); + vfloat32m4_t _S1p1 = __riscv_vget_v_f32m4x2_f32m4(_S1, 1); - vfloat32m4_t _a0; - vfloat32m4_t _a1; - vlseg2e32_v_f32m4(&_a0, &_a1, alphap, vl); + vfloat32m4x2_t _a = __riscv_vlseg2e32_v_f32m4x2(alphap, vl); + vfloat32m4_t _a0 = __riscv_vget_v_f32m4x2_f32m4(_a, 0); + vfloat32m4_t _a1 = __riscv_vget_v_f32m4x2_f32m4(_a, 1); - vfloat32m4_t _rows1 = vfmacc_vv_f32m4(vfmul_vv_f32m4(_S1p0, _a0, vl), _S1p1, _a1, vl); + vfloat32m4_t _rows1 = __riscv_vfmacc_vv_f32m4(__riscv_vfmul_vv_f32m4(_S1p0, _a0, vl), _S1p1, _a1, vl); - vse32_v_f32m4(rows1p, _rows1, vl); + __riscv_vse32_v_f32m4(rows1p, _rows1, vl); pxofs += vl; alphap += vl * 2; @@ -136,27 +136,26 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x int n = w; while (n > 0) { - size_t vl = vsetvl_e32m4(n); + size_t vl = __riscv_vsetvl_e32m4(n); - vuint32m4_t _sx = vmul_vx_u32m4(vle32_v_u32m4(pxofs, vl), sizeof(float), vl); + vuint32m4_t _sx = __riscv_vmul_vx_u32m4(__riscv_vle32_v_u32m4(pxofs, vl), sizeof(float), vl); - vfloat32m4_t _S0p0; - vfloat32m4_t _S0p1; - vfloat32m4_t _S1p0; - vfloat32m4_t _S1p1; + vfloat32m4x2_t _S0 = __riscv_vloxseg2ei32_v_f32m4x2(S0, _sx, vl); + vfloat32m4x2_t _S1 = __riscv_vloxseg2ei32_v_f32m4x2(S1, _sx, vl); + vfloat32m4_t _S0p0 = __riscv_vget_v_f32m4x2_f32m4(_S0, 0); + vfloat32m4_t _S0p1 = __riscv_vget_v_f32m4x2_f32m4(_S0, 1); + vfloat32m4_t _S1p0 = __riscv_vget_v_f32m4x2_f32m4(_S1, 0); + vfloat32m4_t _S1p1 = __riscv_vget_v_f32m4x2_f32m4(_S1, 1); - vloxseg2ei32_v_f32m4(&_S0p0, &_S0p1, S0, _sx, vl); - vloxseg2ei32_v_f32m4(&_S1p0, &_S1p1, S1, _sx, vl); + vfloat32m4x2_t _a = __riscv_vlseg2e32_v_f32m4x2(alphap, vl); + vfloat32m4_t _a0 = __riscv_vget_v_f32m4x2_f32m4(_a, 0); + vfloat32m4_t _a1 = __riscv_vget_v_f32m4x2_f32m4(_a, 1); - vfloat32m4_t _a0; - vfloat32m4_t _a1; - vlseg2e32_v_f32m4(&_a0, &_a1, alphap, vl); + vfloat32m4_t _rows0 = __riscv_vfmacc_vv_f32m4(__riscv_vfmul_vv_f32m4(_S0p0, _a0, vl), _S0p1, _a1, vl); + vfloat32m4_t _rows1 = __riscv_vfmacc_vv_f32m4(__riscv_vfmul_vv_f32m4(_S1p0, _a0, vl), _S1p1, _a1, vl); - vfloat32m4_t _rows0 = vfmacc_vv_f32m4(vfmul_vv_f32m4(_S0p0, _a0, vl), _S0p1, _a1, vl); - vfloat32m4_t _rows1 = vfmacc_vv_f32m4(vfmul_vv_f32m4(_S1p0, _a0, vl), _S1p1, _a1, vl); - - vse32_v_f32m4(rows0p, _rows0, vl); - vse32_v_f32m4(rows1p, _rows1, vl); + __riscv_vse32_v_f32m4(rows0p, _rows0, vl); + __riscv_vse32_v_f32m4(rows1p, _rows1, vl); pxofs += vl; alphap += vl * 2; @@ -195,14 +194,14 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x int n = w; while (n > 0) { - size_t vl = vsetvl_e32m8(n); + size_t vl = 
__riscv_vsetvl_e32m8(n); - vfloat32m8_t _rows0 = vle32_v_f32m8(rows0p, vl); - vfloat32m8_t _rows1 = vle32_v_f32m8(rows1p, vl); + vfloat32m8_t _rows0 = __riscv_vle32_v_f32m8(rows0p, vl); + vfloat32m8_t _rows1 = __riscv_vle32_v_f32m8(rows1p, vl); - vfloat32m8_t _Dp = vfmacc_vf_f32m8(vfmul_vf_f32m8(_rows0, b0, vl), b1, _rows1, vl); + vfloat32m8_t _Dp = __riscv_vfmacc_vf_f32m8(__riscv_vfmul_vf_f32m8(_rows0, b0, vl), b1, _rows1, vl); - vse32_v_f32m8(Dp, _Dp, vl); + __riscv_vse32_v_f32m8(Dp, _Dp, vl); Dp += vl; rows0p += vl; diff --git a/src/layer/riscv/interp_bilinear_fp16s.h b/src/layer/riscv/interp_bilinear_fp16s.h index 318f36f8ab8..d0fcde65643 100644 --- a/src/layer/riscv/interp_bilinear_fp16s.h +++ b/src/layer/riscv/interp_bilinear_fp16s.h @@ -128,23 +128,30 @@ static void resize_bilinear_image_fp16s(const Mat& src, Mat& dst, float* alpha, float* rows1p = rows1; __fp16* Dp = dst.row<__fp16>(dy); +#if __riscv_zvfh int n = w; while (n > 0) { - size_t vl = vsetvl_e16m4(n); + size_t vl = __riscv_vsetvl_e16m4(n); - vfloat32m8_t _rows0 = vle32_v_f32m8(rows0p, vl); - vfloat32m8_t _rows1 = vle32_v_f32m8(rows1p, vl); + vfloat32m8_t _rows0 = __riscv_vle32_v_f32m8(rows0p, vl); + vfloat32m8_t _rows1 = __riscv_vle32_v_f32m8(rows1p, vl); - vfloat32m8_t _Dp = vfmacc_vf_f32m8(vfmul_vf_f32m8(_rows0, b0, vl), b1, _rows1, vl); + vfloat32m8_t _Dp = __riscv_vfmacc_vf_f32m8(__riscv_vfmul_vf_f32m8(_rows0, b0, vl), b1, _rows1, vl); - vse16_v_f16m4(Dp, vfncvt_f_f_w_f16m4(_Dp, vl), vl); + __riscv_vse16_v_f16m4(Dp, __riscv_vfncvt_f_f_w_f16m4(_Dp, vl), vl); Dp += vl; rows0p += vl; rows1p += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < w; i++) + { + *Dp++ = (__fp16)(*rows0p++ * b0 + *rows1p++ * b1); + } +#endif // __riscv_zvfh beta += 2; } @@ -229,23 +236,30 @@ static void resize_bilinear_image_fp16sa(const Mat& src, Mat& dst, __fp16* alpha __fp16* rows1p = rows1; __fp16* Dp = dst.row<__fp16>(dy); +#if __riscv_zvfh int n = w; while (n > 0) { - size_t vl = vsetvl_e16m8(n); + size_t vl = __riscv_vsetvl_e16m8(n); - vfloat16m8_t _rows0 = vle16_v_f16m8(rows0p, vl); - vfloat16m8_t _rows1 = vle16_v_f16m8(rows1p, vl); + vfloat16m8_t _rows0 = __riscv_vle16_v_f16m8(rows0p, vl); + vfloat16m8_t _rows1 = __riscv_vle16_v_f16m8(rows1p, vl); - vfloat16m8_t _Dp = vfmacc_vf_f16m8(vfmul_vf_f16m8(_rows0, b0, vl), b1, _rows1, vl); + vfloat16m8_t _Dp = __riscv_vfmacc_vf_f16m8(__riscv_vfmul_vf_f16m8(_rows0, b0, vl), b1, _rows1, vl); - vse16_v_f16m8(Dp, _Dp, vl); + __riscv_vse16_v_f16m8(Dp, _Dp, vl); Dp += vl; rows0p += vl; rows1p += vl; n -= vl; } +#else // __riscv_zvfh + for (int i = 0; i < w; i++) + { + *Dp++ = *rows0p++ * b0 + *rows1p++ * b1; + } +#endif // __riscv_zvfh beta += 2; } diff --git a/src/layer/riscv/interp_bilinear_packn.h b/src/layer/riscv/interp_bilinear_packn.h index 725651dd56f..d5537738361 100644 --- a/src/layer/riscv/interp_bilinear_packn.h +++ b/src/layer/riscv/interp_bilinear_packn.h @@ -15,7 +15,7 @@ static void resize_bilinear_image_packn(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int w = dst.w; int h = dst.h; @@ -52,11 +52,11 @@ static void resize_bilinear_image_packn(const Mat& src, Mat& dst, float* alpha, int sx = xofs[dx] * packn; const float* S1p = S1 + sx; - vfloat32m1_t _S10 = vle32_v_f32m1(S1p, vl); - vfloat32m1_t _S11 = vle32_v_f32m1(S1p + packn, vl); - vfloat32m1_t _rows1 = vfmacc_vf_f32m1(vfmul_vf_f32m1(_S10, alphap[0], vl), 
alphap[1], _S11, vl); + vfloat32m1_t _S10 = __riscv_vle32_v_f32m1(S1p, vl); + vfloat32m1_t _S11 = __riscv_vle32_v_f32m1(S1p + packn, vl); + vfloat32m1_t _rows1 = __riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_S10, alphap[0], vl), alphap[1], _S11, vl); - vse32_v_f32m1(rows1p + dx * packn, _rows1, vl); + __riscv_vse32_v_f32m1(rows1p + dx * packn, _rows1, vl); alphap += 2; } @@ -77,15 +77,15 @@ static void resize_bilinear_image_packn(const Mat& src, Mat& dst, float* alpha, const float* S0p = S0 + sx; const float* S1p = S1 + sx; - vfloat32m1_t _S00 = vle32_v_f32m1(S0p, vl); - vfloat32m1_t _S01 = vle32_v_f32m1(S0p + packn, vl); - vfloat32m1_t _S10 = vle32_v_f32m1(S1p, vl); - vfloat32m1_t _S11 = vle32_v_f32m1(S1p + packn, vl); - vfloat32m1_t _rows0 = vfmacc_vf_f32m1(vfmul_vf_f32m1(_S00, alphap[0], vl), alphap[1], _S01, vl); - vfloat32m1_t _rows1 = vfmacc_vf_f32m1(vfmul_vf_f32m1(_S10, alphap[0], vl), alphap[1], _S11, vl); + vfloat32m1_t _S00 = __riscv_vle32_v_f32m1(S0p, vl); + vfloat32m1_t _S01 = __riscv_vle32_v_f32m1(S0p + packn, vl); + vfloat32m1_t _S10 = __riscv_vle32_v_f32m1(S1p, vl); + vfloat32m1_t _S11 = __riscv_vle32_v_f32m1(S1p + packn, vl); + vfloat32m1_t _rows0 = __riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_S00, alphap[0], vl), alphap[1], _S01, vl); + vfloat32m1_t _rows1 = __riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_S10, alphap[0], vl), alphap[1], _S11, vl); - vse32_v_f32m1(rows0p + dx * packn, _rows0, vl); - vse32_v_f32m1(rows1p + dx * packn, _rows1, vl); + __riscv_vse32_v_f32m1(rows0p + dx * packn, _rows0, vl); + __riscv_vse32_v_f32m1(rows1p + dx * packn, _rows1, vl); alphap += 2; } @@ -103,12 +103,12 @@ static void resize_bilinear_image_packn(const Mat& src, Mat& dst, float* alpha, for (int dx = 0; dx < w; dx++) { - vfloat32m1_t _rows0 = vle32_v_f32m1(rows0p, vl); - vfloat32m1_t _rows1 = vle32_v_f32m1(rows1p, vl); + vfloat32m1_t _rows0 = __riscv_vle32_v_f32m1(rows0p, vl); + vfloat32m1_t _rows1 = __riscv_vle32_v_f32m1(rows1p, vl); - vfloat32m1_t _Dp = vfmacc_vf_f32m1(vfmul_vf_f32m1(_rows0, b0, vl), b1, _rows1, vl); + vfloat32m1_t _Dp = __riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_rows0, b0, vl), b1, _rows1, vl); - vse32_v_f32m1(Dp, _Dp, vl); + __riscv_vse32_v_f32m1(Dp, _Dp, vl); Dp += packn; rows0p += packn; diff --git a/src/layer/riscv/interp_bilinear_packn_fp16s.h b/src/layer/riscv/interp_bilinear_packn_fp16s.h index bfa239431f1..b2c195d46ff 100644 --- a/src/layer/riscv/interp_bilinear_packn_fp16s.h +++ b/src/layer/riscv/interp_bilinear_packn_fp16s.h @@ -15,7 +15,7 @@ static void resize_bilinear_image_packn_fp16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; @@ -52,11 +52,11 @@ static void resize_bilinear_image_packn_fp16s(const Mat& src, Mat& dst, float* a int sx = xofs[dx] * packn; const __fp16* S1p = S1 + sx; - vfloat16m1_t _S10 = vle16_v_f16m1(S1p, vl); - vfloat16m1_t _S11 = vle16_v_f16m1(S1p + packn, vl); - vfloat32m2_t _rows1 = vfwmacc_vf_f32m2(vfwmul_vf_f32m2(_S10, alphap[0], vl), alphap[1], _S11, vl); + vfloat16m1_t _S10 = __riscv_vle16_v_f16m1(S1p, vl); + vfloat16m1_t _S11 = __riscv_vle16_v_f16m1(S1p + packn, vl); + vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, alphap[0], vl), alphap[1], _S11, vl); - vse32_v_f32m2(rows1p + dx * packn, _rows1, vl); + __riscv_vse32_v_f32m2(rows1p + dx * packn, _rows1, vl); alphap += 2; } @@ -77,15 +77,15 @@ static 
void resize_bilinear_image_packn_fp16s(const Mat& src, Mat& dst, float* a const __fp16* S0p = S0 + sx; const __fp16* S1p = S1 + sx; - vfloat16m1_t _S00 = vle16_v_f16m1(S0p, vl); - vfloat16m1_t _S01 = vle16_v_f16m1(S0p + packn, vl); - vfloat16m1_t _S10 = vle16_v_f16m1(S1p, vl); - vfloat16m1_t _S11 = vle16_v_f16m1(S1p + packn, vl); - vfloat32m2_t _rows0 = vfwmacc_vf_f32m2(vfwmul_vf_f32m2(_S00, alphap[0], vl), alphap[1], _S01, vl); - vfloat32m2_t _rows1 = vfwmacc_vf_f32m2(vfwmul_vf_f32m2(_S10, alphap[0], vl), alphap[1], _S11, vl); + vfloat16m1_t _S00 = __riscv_vle16_v_f16m1(S0p, vl); + vfloat16m1_t _S01 = __riscv_vle16_v_f16m1(S0p + packn, vl); + vfloat16m1_t _S10 = __riscv_vle16_v_f16m1(S1p, vl); + vfloat16m1_t _S11 = __riscv_vle16_v_f16m1(S1p + packn, vl); + vfloat32m2_t _rows0 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S00, alphap[0], vl), alphap[1], _S01, vl); + vfloat32m2_t _rows1 = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S10, alphap[0], vl), alphap[1], _S11, vl); - vse32_v_f32m2(rows0p + dx * packn, _rows0, vl); - vse32_v_f32m2(rows1p + dx * packn, _rows1, vl); + __riscv_vse32_v_f32m2(rows0p + dx * packn, _rows0, vl); + __riscv_vse32_v_f32m2(rows1p + dx * packn, _rows1, vl); alphap += 2; } @@ -103,12 +103,12 @@ static void resize_bilinear_image_packn_fp16s(const Mat& src, Mat& dst, float* a for (int dx = 0; dx < w; dx++) { - vfloat32m2_t _rows0 = vle32_v_f32m2(rows0p, vl); - vfloat32m2_t _rows1 = vle32_v_f32m2(rows1p, vl); + vfloat32m2_t _rows0 = __riscv_vle32_v_f32m2(rows0p, vl); + vfloat32m2_t _rows1 = __riscv_vle32_v_f32m2(rows1p, vl); - vfloat32m2_t _Dp = vfmacc_vf_f32m2(vfmul_vf_f32m2(_rows0, b0, vl), b1, _rows1, vl); + vfloat32m2_t _Dp = __riscv_vfmacc_vf_f32m2(__riscv_vfmul_vf_f32m2(_rows0, b0, vl), b1, _rows1, vl); - vse16_v_f16m1(Dp, vfncvt_f_f_w_f16m1(_Dp, vl), vl); + __riscv_vse16_v_f16m1(Dp, __riscv_vfncvt_f_f_w_f16m1(_Dp, vl), vl); Dp += packn; rows0p += packn; @@ -122,7 +122,7 @@ static void resize_bilinear_image_packn_fp16s(const Mat& src, Mat& dst, float* a static void resize_bilinear_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* alpha, int* xofs, __fp16* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; @@ -159,11 +159,11 @@ static void resize_bilinear_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* int sx = xofs[dx] * packn; const __fp16* S1p = S1 + sx; - vfloat16m1_t _S10 = vle16_v_f16m1(S1p, vl); - vfloat16m1_t _S11 = vle16_v_f16m1(S1p + packn, vl); - vfloat16m1_t _rows1 = vfmacc_vf_f16m1(vfmul_vf_f16m1(_S10, alphap[0], vl), alphap[1], _S11, vl); + vfloat16m1_t _S10 = __riscv_vle16_v_f16m1(S1p, vl); + vfloat16m1_t _S11 = __riscv_vle16_v_f16m1(S1p + packn, vl); + vfloat16m1_t _rows1 = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S10, alphap[0], vl), alphap[1], _S11, vl); - vse16_v_f16m1(rows1p + dx * packn, _rows1, vl); + __riscv_vse16_v_f16m1(rows1p + dx * packn, _rows1, vl); alphap += 2; } @@ -184,15 +184,15 @@ static void resize_bilinear_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* const __fp16* S0p = S0 + sx; const __fp16* S1p = S1 + sx; - vfloat16m1_t _S00 = vle16_v_f16m1(S0p, vl); - vfloat16m1_t _S01 = vle16_v_f16m1(S0p + packn, vl); - vfloat16m1_t _S10 = vle16_v_f16m1(S1p, vl); - vfloat16m1_t _S11 = vle16_v_f16m1(S1p + packn, vl); - vfloat16m1_t _rows0 = vfmacc_vf_f16m1(vfmul_vf_f16m1(_S00, alphap[0], vl), alphap[1], _S01, vl); - vfloat16m1_t _rows1 = vfmacc_vf_f16m1(vfmul_vf_f16m1(_S10, 
alphap[0], vl), alphap[1], _S11, vl); + vfloat16m1_t _S00 = __riscv_vle16_v_f16m1(S0p, vl); + vfloat16m1_t _S01 = __riscv_vle16_v_f16m1(S0p + packn, vl); + vfloat16m1_t _S10 = __riscv_vle16_v_f16m1(S1p, vl); + vfloat16m1_t _S11 = __riscv_vle16_v_f16m1(S1p + packn, vl); + vfloat16m1_t _rows0 = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S00, alphap[0], vl), alphap[1], _S01, vl); + vfloat16m1_t _rows1 = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S10, alphap[0], vl), alphap[1], _S11, vl); - vse16_v_f16m1(rows0p + dx * packn, _rows0, vl); - vse16_v_f16m1(rows1p + dx * packn, _rows1, vl); + __riscv_vse16_v_f16m1(rows0p + dx * packn, _rows0, vl); + __riscv_vse16_v_f16m1(rows1p + dx * packn, _rows1, vl); alphap += 2; } @@ -210,12 +210,12 @@ static void resize_bilinear_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* for (int dx = 0; dx < w; dx++) { - vfloat16m1_t _rows0 = vle16_v_f16m1(rows0p, vl); - vfloat16m1_t _rows1 = vle16_v_f16m1(rows1p, vl); + vfloat16m1_t _rows0 = __riscv_vle16_v_f16m1(rows0p, vl); + vfloat16m1_t _rows1 = __riscv_vle16_v_f16m1(rows1p, vl); - vfloat16m1_t _Dp = vfmacc_vf_f16m1(vfmul_vf_f16m1(_rows0, b0, vl), b1, _rows1, vl); + vfloat16m1_t _Dp = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_rows0, b0, vl), b1, _rows1, vl); - vse16_v_f16m1(Dp, _Dp, vl); + __riscv_vse16_v_f16m1(Dp, _Dp, vl); Dp += packn; rows0p += packn; diff --git a/src/layer/riscv/interp_riscv.cpp b/src/layer/riscv/interp_riscv.cpp index ac72cf9b63c..1ecb2eaf96d 100644 --- a/src/layer/riscv/interp_riscv.cpp +++ b/src/layer/riscv/interp_riscv.cpp @@ -19,6 +19,8 @@ #include "riscv_usability.h" #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { #include "interp_bicubic.h" @@ -27,22 +29,20 @@ namespace ncnn { #if __riscv_vector #include "interp_bicubic_packn.h" #include "interp_bilinear_packn.h" -#if __riscv_zfh -#include "interp_bicubic_fp16s.h" -#include "interp_bicubic_packn_fp16s.h" -#include "interp_bilinear_fp16s.h" -#include "interp_bilinear_packn_fp16s.h" -#endif #endif Interp_riscv::Interp_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif } int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const @@ -51,9 +51,9 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector const Mat& reference_blob = bottom_blobs[1]; Mat& top_blob = top_blobs[0]; +#if NCNN_ZFH int elembits = bottom_blob.elembits(); -#if __riscv_vector && __riscv_zfh if (opt.use_fp16_storage && elembits == 16) { if (opt.use_fp16_arithmetic) @@ -86,13 +86,13 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector #if __riscv_vector if (elempack == packn) { - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < w; q++) { Mat top_blob_c = top_blob.channel(q); - vfloat32m1_t _v = vle32_v_f32m1((const float*)bottom_blob + q * packn, vl); + vfloat32m1_t _v = __riscv_vle32_v_f32m1((const float*)bottom_blob + q * packn, vl); top_blob_c.fill(_v); } @@ -128,7 +128,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector { if (resize_type == 1) // nearest { - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; @@ -141,8 +141,8 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector { int in_x = std::min((int)(x * ws), (w - 1)); - vfloat32m1_t _p = vle32_v_f32m1(ptr + in_x * packn, vl); - vse32_v_f32m1(outptr, _p, vl); + vfloat32m1_t _p = __riscv_vle32_v_f32m1(ptr + in_x * packn, vl); + __riscv_vse32_v_f32m1(outptr, _p, vl); outptr += packn; } @@ -151,7 +151,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector if (resize_type == 2) // bilinear { - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int* buf = new int[outw + outw * packn]; @@ -172,11 +172,11 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector int sx = xofs[x] * packn; const float* Sp = ptr + sx; - vfloat32m1_t _S0 = vle32_v_f32m1(Sp, vl); - vfloat32m1_t _S1 = vle32_v_f32m1(Sp + packn, vl); - vfloat32m1_t _p = vfmacc_vf_f32m1(vfmul_vf_f32m1(_S0, alphap[0], vl), alphap[1], _S1, vl); + vfloat32m1_t _S0 = __riscv_vle32_v_f32m1(Sp, vl); + vfloat32m1_t _S1 = __riscv_vle32_v_f32m1(Sp + packn, vl); + vfloat32m1_t _p = __riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_S0, alphap[0], vl), alphap[1], _S1, vl); - vse32_v_f32m1(outptr, _p, vl); + __riscv_vse32_v_f32m1(outptr, _p, vl); alphap += 2; outptr += packn; @@ -188,7 +188,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector if (resize_type == 3) // bicubic { - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int* buf = new int[outw + outw * packn]; @@ -209,13 +209,13 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector int sx = xofs[x] * packn; const float* Sp = ptr + sx; - vfloat32m1_t _S0 = vle32_v_f32m1(Sp - packn, vl); - vfloat32m1_t _S1 = vle32_v_f32m1(Sp, vl); - vfloat32m1_t _S2 = vle32_v_f32m1(Sp + packn, vl); - vfloat32m1_t _S3 = vle32_v_f32m1(Sp + packn * 2, vl); - vfloat32m1_t _p = vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmacc_vf_f32m1(vfmul_vf_f32m1(_S0, alphap[0], vl), alphap[1], _S1, vl), alphap[2], _S2, vl), alphap[3], _S3, vl); + vfloat32m1_t _S0 = __riscv_vle32_v_f32m1(Sp - packn, vl); + vfloat32m1_t _S1 = __riscv_vle32_v_f32m1(Sp, vl); + vfloat32m1_t _S2 = __riscv_vle32_v_f32m1(Sp + packn, vl); + vfloat32m1_t _S3 = __riscv_vle32_v_f32m1(Sp + packn * 2, vl); + vfloat32m1_t _p = __riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmacc_vf_f32m1(__riscv_vfmul_vf_f32m1(_S0, alphap[0], vl), alphap[1], _S1, vl), alphap[2], _S2, vl), alphap[3], _S3, vl); - vse32_v_f32m1(outptr, _p, vl); + __riscv_vse32_v_f32m1(outptr, _p, vl); alphap += 4; outptr += packn; @@ -326,7 +326,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector { if (resize_type == 1) // nearest { - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); const float hs = output_height ? h / (float)outh : 1.f / height_scale; const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; @@ -347,8 +347,8 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector { int in_x = std::min((int)(x * ws), (w - 1)); - vfloat32m1_t _p = vle32_v_f32m1(ptr + in_x * packn, vl); - vse32_v_f32m1(outptr, _p, vl); + vfloat32m1_t _p = __riscv_vle32_v_f32m1(ptr + in_x * packn, vl); + __riscv_vse32_v_f32m1(outptr, _p, vl); outptr += packn; } @@ -489,729 +489,4 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector return 0; } -#if __riscv_vector && __riscv_zfh -int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - - const Mat& bottom_blob = bottom_blobs[0]; - const Mat& reference_blob = bottom_blobs[1]; - Mat& top_blob = top_blobs[0]; - - int h = bottom_blob.h; - int w = bottom_blob.w; - int channels = bottom_blob.c; - int dims = bottom_blob.dims; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - int outw = reference_blob.w; - int outh = reference_blob.h; - - if (dims == 1) - { - top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (elempack == packn) - { - const size_t vl = vsetvl_e16m1(packn); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < w; q++) - { - Mat top_blob_c = top_blob.channel(q); - vfloat16m1_t _v = vle16_v_f16m1((const __fp16*)bottom_blob + q * packn, vl); - top_blob_c.fill(_v); - } - - return 0; - } - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < w; q++) - { - Mat top_blob_c = top_blob.channel(q); - const __fp16* ptr = bottom_blob; - top_blob_c.fill(ptr[q]); - } - - return 0; - } - - if (dims == 2) - { - if (outw == w) - { - top_blob = bottom_blob; - return 0; - } - - top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (elempack == packn) - { - if (resize_type == 1) // nearest - { - const size_t vl = vsetvl_e16m1(packn); - - const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - for (int x = 0; x < outw; x++) - { - int in_x = std::min((int)(x * ws), (w - 1)); - - vfloat16m1_t _p = vle16_v_f16m1(ptr + in_x * packn, vl); - vse16_v_f16m1(outptr, _p, vl); - - outptr += packn; - } - } - } - - if (resize_type == 2) // bilinear - { - const size_t vl = vsetvl_e16m1(packn); - - int* buf = new int[outw + outw * packn]; - - int* xofs = buf; - float* alpha = (float*)(buf + outw); - - linear_coeffs(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const float* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x] * packn; - const __fp16* Sp = ptr + sx; - - vfloat16m1_t _S0 = vle16_v_f16m1(Sp, vl); - vfloat16m1_t _S1 = vle16_v_f16m1(Sp + packn, vl); - vfloat32m2_t _p = vfwmacc_vf_f32m2(vfwmul_vf_f32m2(_S0, alphap[0], vl), alphap[1], _S1, vl); - - vse16_v_f16m1(outptr, vfncvt_f_f_w_f16m1(_p, vl), vl); - - alphap += 2; - outptr += packn; - } - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - const size_t vl = vsetvl_e16m1(packn); - - int* buf = new int[outw + outw * packn]; - - int* xofs = buf; - float* alpha = (float*)(buf + outw); - - cubic_coeffs(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const float* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x] * packn; - const __fp16* Sp = ptr + sx; - - vfloat16m1_t _S0 = vle16_v_f16m1(Sp - packn, vl); - vfloat16m1_t _S1 = vle16_v_f16m1(Sp, vl); - vfloat16m1_t _S2 = vle16_v_f16m1(Sp + packn, vl); - vfloat16m1_t _S3 = vle16_v_f16m1(Sp + packn * 2, vl); - vfloat32m2_t _p = vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmacc_vf_f32m2(vfwmul_vf_f32m2(_S0, alphap[0], vl), alphap[1], _S1, vl), alphap[2], _S2, vl), alphap[3], _S3, vl); - - vse16_v_f16m1(outptr, vfncvt_f_f_w_f16m1(_p, vl), vl); - - alphap += 4; - outptr += packn; - } - } - - delete[] buf; - } - - return 0; - } - - if (resize_type == 1) // nearest - { - const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - for (int x = 0; x < outw; x++) - { - int in_x = std::min((int)(x * ws), (w - 1)); - *outptr++ = ptr[in_x]; - } - } - } - - if (resize_type == 2) // bilinear - { - int* buf = new int[outw + outw * 2]; - - int* xofs = buf; - float* alpha = (float*)(buf + outw); - - linear_coeffs(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const float* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x]; - const __fp16* Sp = ptr + sx; - float a0 = alphap[0]; - float a1 = alphap[1]; - *outptr++ = (__fp16)((float)Sp[0] * a0 + (float)Sp[1] * a1); - alphap += 2; - } - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - int* buf = new int[outw + outw * 4]; - - int* xofs = buf; - float* alpha = (float*)(buf + outw); - - cubic_coeffs(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const float* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x]; - const __fp16* Sp = ptr + sx; - float a0 = alphap[0]; - float a1 = alphap[1]; - float a2 = alphap[2]; - float a3 = alphap[3]; - *outptr++ = (__fp16)((float)Sp[-1] * a0 + (float)Sp[0] * a1 + (float)Sp[1] * a2 + (float)Sp[2] * a3); - alphap += 4; - } - } - - delete[] buf; - } - - return 0; - } - - if (outw == w && outh == h) - { - top_blob = bottom_blob; - return 0; - } - - top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (elempack == packn) - { - if (resize_type == 1) // nearest - { - const size_t vl = vsetvl_e16m1(packn); - - const float hs = output_height ? h / (float)outh : 1.f / height_scale; - const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - for (int y = 0; y < outh; y++) - { - int in_y = std::min((int)(y * hs), (h - 1)); - - const __fp16* ptr = src.row(in_y); - __fp16* outptr = dst.row<__fp16>(y); - for (int x = 0; x < outw; x++) - { - int in_x = std::min((int)(x * ws), (w - 1)); - - vfloat16m1_t _p = vle16_v_f16m1(ptr + in_x * packn, vl); - vse16_v_f16m1(outptr, _p, vl); - - outptr += packn; - } - } - } - } - - if (resize_type == 2) // bilinear - { - int* buf = new int[outw + outh + outw * 2 + outh * 2]; - - int* xofs = buf; //new int[outw]; - int* yofs = buf + outw; //new int[outh]; - - float* alpha = (float*)(buf + outw + outh); //new float[outw * 2]; - float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2]; - - linear_coeffs(w, outw, xofs, alpha, align_corner); - linear_coeffs(h, outh, yofs, beta, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - resize_bilinear_image_packn_fp16s(src, dst, alpha, xofs, beta, yofs); - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - int* buf = new int[outw + outh + outw * 4 + outh * 4]; - - int* xofs = buf; //new int[outw]; - int* yofs = buf + outw; //new int[outh]; - - float* alpha = (float*)(buf + outw + outh); //new float[outw * 4]; - float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4]; - - cubic_coeffs(w, outw, xofs, alpha, align_corner); - cubic_coeffs(h, outh, yofs, beta, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - resize_bicubic_image_packn_fp16s(src, dst, alpha, xofs, beta, yofs); - } - - delete[] buf; - } - - return 0; - } - - if (resize_type == 1) // nearest - { - const float hs = output_height ? h / (float)outh : 1.f / height_scale; - const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - for (int y = 0; y < outh; y++) - { - int in_y = std::min((int)(y * hs), (h - 1)); - - const __fp16* ptr = src.row(in_y); - __fp16* outptr = dst.row<__fp16>(y); - for (int x = 0; x < outw; x++) - { - int in_x = std::min((int)(x * ws), (w - 1)); - *outptr++ = ptr[in_x]; - } - } - } - } - - if (resize_type == 2) // bilinear - { - int* buf = new int[outw + outh + outw * 2 + outh * 2]; - - int* xofs = buf; //new int[outw]; - int* yofs = buf + outw; //new int[outh]; - - float* alpha = (float*)(buf + outw + outh); //new float[outw * 2]; - float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2]; - - linear_coeffs(w, outw, xofs, alpha, align_corner); - linear_coeffs(h, outh, yofs, beta, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - resize_bilinear_image_fp16s(src, dst, alpha, xofs, beta, yofs); - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - int* buf = new int[outw + outh + outw * 4 + outh * 4]; - - int* xofs = buf; //new int[outw]; - int* yofs = buf + outw; //new int[outh]; - - float* alpha = (float*)(buf + outw + outh); //new float[outw * 4]; - float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4]; - - cubic_coeffs(w, outw, xofs, alpha, align_corner); - cubic_coeffs(h, outh, yofs, beta, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - resize_bicubic_image_fp16s(src, dst, alpha, xofs, beta, yofs); - } - - delete[] buf; - } - - return 0; -} - -int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const -{ - const int packn = csrr_vlenb() / 2; - - const Mat& bottom_blob = bottom_blobs[0]; - const Mat& reference_blob = bottom_blobs[1]; - Mat& top_blob = top_blobs[0]; - - int h = bottom_blob.h; - int w = bottom_blob.w; - int channels = bottom_blob.c; - int dims = bottom_blob.dims; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - int outw = reference_blob.w; - int outh = reference_blob.h; - - if (dims == 1 || resize_type == 1) // nearest - { - return forward_fp16s(bottom_blobs, top_blobs, opt); - } - - if (dims == 2) - { - if (outw == w) - { - top_blob = bottom_blob; - return 0; - } - - top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (elempack == packn) - { - if (resize_type == 2) // bilinear - { - const size_t vl = vsetvl_e16m1(packn); - - int* buf = new int[outw + outw * packn]; - - int* xofs = buf; - __fp16* alpha = (__fp16*)(buf + outw); - - linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const __fp16* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x] * packn; - const __fp16* Sp = ptr + sx; - - vfloat16m1_t _S0 = vle16_v_f16m1(Sp, vl); - vfloat16m1_t _S1 = vle16_v_f16m1(Sp + packn, vl); - vfloat16m1_t _p = vfmacc_vf_f16m1(vfmul_vf_f16m1(_S0, alphap[0], vl), alphap[1], _S1, vl); 
- - vse16_v_f16m1(outptr, _p, vl); - - alphap += 2; - outptr += packn; - } - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - const size_t vl = vsetvl_e16m1(packn); - - int* buf = new int[outw + outw * packn]; - - int* xofs = buf; - __fp16* alpha = (__fp16*)(buf + outw); - - cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const __fp16* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x] * packn; - const __fp16* Sp = ptr + sx; - - vfloat16m1_t _S0 = vle16_v_f16m1(Sp - packn, vl); - vfloat16m1_t _S1 = vle16_v_f16m1(Sp, vl); - vfloat16m1_t _S2 = vle16_v_f16m1(Sp + packn, vl); - vfloat16m1_t _S3 = vle16_v_f16m1(Sp + packn * 2, vl); - vfloat16m1_t _p = vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmacc_vf_f16m1(vfmul_vf_f16m1(_S0, alphap[0], vl), alphap[1], _S1, vl), alphap[2], _S2, vl), alphap[3], _S3, vl); - - vse16_v_f16m1(outptr, _p, vl); - - alphap += 4; - outptr += packn; - } - } - - delete[] buf; - } - - return 0; - } - - if (resize_type == 2) // bilinear - { - int* buf = new int[outw + outw * 2]; - - int* xofs = buf; - __fp16* alpha = (__fp16*)(buf + outw); - - linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const __fp16* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x]; - const __fp16* Sp = ptr + sx; - __fp16 a0 = alphap[0]; - __fp16 a1 = alphap[1]; - *outptr++ = Sp[0] * a0 + Sp[1] * a1; - alphap += 2; - } - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - int* buf = new int[outw + outw * 4]; - - int* xofs = buf; - __fp16* alpha = (__fp16*)(buf + outw); - - cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; y++) - { - const __fp16* ptr = bottom_blob.row(y); - __fp16* outptr = top_blob.row<__fp16>(y); - const __fp16* alphap = alpha; - - for (int x = 0; x < outw; x++) - { - int sx = xofs[x]; - const __fp16* Sp = ptr + sx; - __fp16 a0 = alphap[0]; - __fp16 a1 = alphap[1]; - __fp16 a2 = alphap[2]; - __fp16 a3 = alphap[3]; - *outptr++ = Sp[-1] * a0 + Sp[0] * a1 + Sp[1] * a2 + Sp[2] * a3; - alphap += 4; - } - } - - delete[] buf; - } - - return 0; - } - - if (outw == w && outh == h) - { - top_blob = bottom_blob; - return 0; - } - - top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - if (elempack == packn) - { - if (resize_type == 2) // bilinear - { - int* buf = new int[outw + outh + outw * 2 + outh * 2]; - - int* xofs = buf; //new int[outw]; - int* yofs = buf + outw; //new int[outh]; - - __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 2]; - __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2]; - - linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); - linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - resize_bilinear_image_packn_fp16sa(src, dst, alpha, xofs, beta, yofs); - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - int* buf = new int[outw + 
outh + outw * 4 + outh * 4]; - - int* xofs = buf; //new int[outw]; - int* yofs = buf + outw; //new int[outh]; - - __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 4]; - __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4]; - - cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); - cubic_coeffs_fp16sa(h, outh, yofs, beta, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - resize_bicubic_image_packn_fp16sa(src, dst, alpha, xofs, beta, yofs); - } - - delete[] buf; - } - - return 0; - } - - if (resize_type == 2) // bilinear - { - int* buf = new int[outw + outh + outw * 2 + outh * 2]; - - int* xofs = buf; //new int[outw]; - int* yofs = buf + outw; //new int[outh]; - - __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 2]; - __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2]; - - linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); - linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - resize_bilinear_image_fp16sa(src, dst, alpha, xofs, beta, yofs); - } - - delete[] buf; - } - - if (resize_type == 3) // bicubic - { - int* buf = new int[outw + outh + outw * 4 + outh * 4]; - - int* xofs = buf; //new int[outw]; - int* yofs = buf + outw; //new int[outh]; - - __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 4]; - __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4]; - - cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); - cubic_coeffs_fp16sa(h, outh, yofs, beta, align_corner); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat src = bottom_blob.channel(q); - Mat dst = top_blob.channel(q); - - resize_bicubic_image_fp16sa(src, dst, alpha, xofs, beta, yofs); - } - - delete[] buf; - } - - return 0; -} -#endif // __riscv_vector && __riscv_zfh - } // namespace ncnn diff --git a/src/layer/riscv/interp_riscv.h b/src/layer/riscv/interp_riscv.h index f479223519b..e9510c4afeb 100644 --- a/src/layer/riscv/interp_riscv.h +++ b/src/layer/riscv/interp_riscv.h @@ -27,7 +27,7 @@ class Interp_riscv : public Interp virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; int forward_fp16sa(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; #endif diff --git a/src/layer/riscv/interp_riscv_zfh.cpp b/src/layer/riscv/interp_riscv_zfh.cpp new file mode 100644 index 00000000000..deb39a7bb58 --- /dev/null +++ b/src/layer/riscv/interp_riscv_zfh.cpp @@ -0,0 +1,775 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "interp_riscv.h" + +#if __riscv_vector +#include +#include "riscv_usability.h" +#endif // __riscv_vector + +namespace ncnn { + +#include "interp_bicubic.h" +#include "interp_bilinear.h" + +#if NCNN_ZFH +#include "interp_bicubic_fp16s.h" +#include "interp_bilinear_fp16s.h" +#if __riscv_zvfh +#include "interp_bicubic_packn_fp16s.h" +#include "interp_bilinear_packn_fp16s.h" +#endif +#endif + +#if NCNN_ZFH +int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh + + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& reference_blob = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + int h = bottom_blob.h; + int w = bottom_blob.w; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = reference_blob.w; + int outh = reference_blob.h; + + if (dims == 1) + { + top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __riscv_zvfh + if (elempack == packn) + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < w; q++) + { + Mat top_blob_c = top_blob.channel(q); + vfloat16m1_t _v = __riscv_vle16_v_f16m1((const __fp16*)bottom_blob + q * packn, vl); + top_blob_c.fill(_v); + } + + return 0; + } +#endif // __riscv_zvfh + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < w; q++) + { + Mat top_blob_c = top_blob.channel(q); + const __fp16* ptr = bottom_blob; + top_blob_c.fill(ptr[q]); + } + + return 0; + } + + if (dims == 2) + { + if (outw == w) + { + top_blob = bottom_blob; + return 0; + } + + top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __riscv_zvfh + if (elempack == packn) + { + if (resize_type == 1) // nearest + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + + vfloat16m1_t _p = __riscv_vle16_v_f16m1(ptr + in_x * packn, vl); + __riscv_vse16_v_f16m1(outptr, _p, vl); + + outptr += packn; + } + } + } + + if (resize_type == 2) // bilinear + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + int* buf = new int[outw + outw * packn]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + linear_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x] * packn; + const __fp16* Sp = ptr + sx; + + vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp, vl); + vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp + packn, vl); + vfloat32m2_t _p = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S0, alphap[0], vl), alphap[1], _S1, vl); + + __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_p, vl), vl); + + alphap += 2; + outptr += packn; + } + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + int* buf = new int[outw + outw * packn]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x] * packn; + const __fp16* Sp = ptr + sx; + + vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp - packn, vl); + vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp, vl); + vfloat16m1_t _S2 = __riscv_vle16_v_f16m1(Sp + packn, vl); + vfloat16m1_t _S3 = __riscv_vle16_v_f16m1(Sp + packn * 2, vl); + vfloat32m2_t _p = __riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmacc_vf_f32m2(__riscv_vfwmul_vf_f32m2(_S0, alphap[0], vl), alphap[1], _S1, vl), alphap[2], _S2, vl), alphap[3], _S3, vl); + + __riscv_vse16_v_f16m1(outptr, __riscv_vfncvt_f_f_w_f16m1(_p, vl), vl); + + alphap += 4; + outptr += packn; + } + } + + delete[] buf; + } + + return 0; + } +#endif // __riscv_zvfh + + if (resize_type == 1) // nearest + { + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + *outptr++ = ptr[in_x]; + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outw * 2]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + linear_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x]; + const __fp16* Sp = ptr + sx; + float a0 = alphap[0]; + float a1 = alphap[1]; + *outptr++ = (__fp16)((float)Sp[0] * a0 + (float)Sp[1] * a1); + alphap += 2; + } + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outw * 4]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x]; + const __fp16* Sp = ptr + sx; + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + *outptr++ = (__fp16)((float)Sp[-1] * a0 + (float)Sp[0] * a1 + (float)Sp[1] * a2 + (float)Sp[2] * a3); + alphap += 4; + } + } + + delete[] buf; + } + + return 0; + } + + if (outw == w && outh == h) + { + top_blob = bottom_blob; + return 0; + } + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __riscv_zvfh + if (elempack == packn) + { + if (resize_type == 1) // nearest + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + const float hs = output_height ? h / (float)outh : 1.f / height_scale; + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + for (int y = 0; y < outh; y++) + { + int in_y = std::min((int)(y * hs), (h - 1)); + + const __fp16* ptr = src.row(in_y); + __fp16* outptr = dst.row<__fp16>(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + + vfloat16m1_t _p = __riscv_vle16_v_f16m1(ptr + in_x * packn, vl); + __riscv_vse16_v_f16m1(outptr, _p, vl); + + outptr += packn; + } + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outh + outw * 2 + outh * 2]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 2]; + float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2]; + + linear_coeffs(w, outw, xofs, alpha, align_corner); + linear_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bilinear_image_packn_fp16s(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outh + outw * 4 + outh * 4]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 4]; + float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4]; + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + cubic_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bicubic_image_packn_fp16s(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + return 0; + } +#endif // __riscv_zvfh + + if (resize_type == 1) // nearest + { + const float hs = output_height ? h / (float)outh : 1.f / height_scale; + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + for (int y = 0; y < outh; y++) + { + int in_y = std::min((int)(y * hs), (h - 1)); + + const __fp16* ptr = src.row(in_y); + __fp16* outptr = dst.row<__fp16>(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + *outptr++ = ptr[in_x]; + } + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outh + outw * 2 + outh * 2]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 2]; + float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2]; + + linear_coeffs(w, outw, xofs, alpha, align_corner); + linear_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bilinear_image_fp16s(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outh + outw * 4 + outh * 4]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 4]; + float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4]; + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + cubic_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bicubic_image_fp16s(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + return 0; +} + +int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; +#endif // __riscv_zvfh + + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& reference_blob = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + int h = bottom_blob.h; + int w = bottom_blob.w; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = reference_blob.w; + int outh = reference_blob.h; + + if (dims == 1 || resize_type == 1) // nearest + { + return forward_fp16s(bottom_blobs, top_blobs, opt); + } + + if (dims == 2) + { + if (outw == w) + { + top_blob = bottom_blob; + return 0; + } + + top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __riscv_zvfh + if (elempack == packn) + { + if (resize_type == 2) // bilinear + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + int* buf = new int[outw + outw * packn]; + + int* xofs = buf; + __fp16* alpha = (__fp16*)(buf + outw); + + linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + const __fp16* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x] * packn; + const __fp16* Sp = ptr + sx; + + vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp, vl); + vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp + packn, vl); + 
vfloat16m1_t _p = __riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S0, alphap[0], vl), alphap[1], _S1, vl); + + __riscv_vse16_v_f16m1(outptr, _p, vl); + + alphap += 2; + outptr += packn; + } + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + const size_t vl = __riscv_vsetvl_e16m1(packn); + + int* buf = new int[outw + outw * packn]; + + int* xofs = buf; + __fp16* alpha = (__fp16*)(buf + outw); + + cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + const __fp16* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x] * packn; + const __fp16* Sp = ptr + sx; + + vfloat16m1_t _S0 = __riscv_vle16_v_f16m1(Sp - packn, vl); + vfloat16m1_t _S1 = __riscv_vle16_v_f16m1(Sp, vl); + vfloat16m1_t _S2 = __riscv_vle16_v_f16m1(Sp + packn, vl); + vfloat16m1_t _S3 = __riscv_vle16_v_f16m1(Sp + packn * 2, vl); + vfloat16m1_t _p = __riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmacc_vf_f16m1(__riscv_vfmul_vf_f16m1(_S0, alphap[0], vl), alphap[1], _S1, vl), alphap[2], _S2, vl), alphap[3], _S3, vl); + + __riscv_vse16_v_f16m1(outptr, _p, vl); + + alphap += 4; + outptr += packn; + } + } + + delete[] buf; + } + + return 0; + } +#endif // __riscv_zvfh + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outw * 2]; + + int* xofs = buf; + __fp16* alpha = (__fp16*)(buf + outw); + + linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + const __fp16* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x]; + const __fp16* Sp = ptr + sx; + __fp16 a0 = alphap[0]; + __fp16 a1 = alphap[1]; + *outptr++ = Sp[0] * a0 + Sp[1] * a1; + alphap += 2; + } + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outw * 4]; + + int* xofs = buf; + __fp16* alpha = (__fp16*)(buf + outw); + + cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const __fp16* ptr = bottom_blob.row(y); + __fp16* outptr = top_blob.row<__fp16>(y); + const __fp16* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x]; + const __fp16* Sp = ptr + sx; + __fp16 a0 = alphap[0]; + __fp16 a1 = alphap[1]; + __fp16 a2 = alphap[2]; + __fp16 a3 = alphap[3]; + *outptr++ = Sp[-1] * a0 + Sp[0] * a1 + Sp[1] * a2 + Sp[2] * a3; + alphap += 4; + } + } + + delete[] buf; + } + + return 0; + } + + if (outw == w && outh == h) + { + top_blob = bottom_blob; + return 0; + } + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __riscv_zvfh + if (elempack == packn) + { + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outh + outw * 2 + outh * 2]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 2]; + __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2]; + + linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); + linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const 
Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bilinear_image_packn_fp16sa(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outh + outw * 4 + outh * 4]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 4]; + __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4]; + + cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); + cubic_coeffs_fp16sa(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bicubic_image_packn_fp16sa(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + return 0; + } +#endif // __riscv_zvfh + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outh + outw * 2 + outh * 2]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 2]; + __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2]; + + linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); + linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bilinear_image_fp16sa(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outh + outw * 4 + outh * 4]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + __fp16* alpha = (__fp16*)(buf + outw + outh); //new __fp16[outw * 4]; + __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4]; + + cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner); + cubic_coeffs_fp16sa(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bicubic_image_fp16sa(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/mish_riscv.cpp b/src/layer/riscv/mish_riscv.cpp index 57b17d3a732..f1e8e477bd0 100644 --- a/src/layer/riscv/mish_riscv.cpp +++ b/src/layer/riscv/mish_riscv.cpp @@ -17,24 +17,29 @@ #if __riscv_vector #include #include "rvv_mathfun.h" -#include "rvv_mathfun_fp16s.h" #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { Mish_riscv::Mish_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif } int Mish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) @@ -62,11 +67,11 @@ int Mish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - size_t vl = vsetvl_e32m8(n); + size_t vl = __riscv_vsetvl_e32m8(n); - vfloat32m8_t _p = 
vle32_v_f32m8(ptr, vl); - _p = vfmul_vv_f32m8(_p, tanh_ps(log_ps(vfadd_vf_f32m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); - vse32_v_f32m8(ptr, _p, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + _p = __riscv_vfmul_vv_f32m8(_p, tanh_ps(log_ps(__riscv_vfadd_vf_f32m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; n -= vl; @@ -83,68 +88,4 @@ int Mish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return 0; } -#if __riscv_vector && __riscv_zfh -int Mish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const -{ - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int d = bottom_top_blob.d; - int channels = bottom_top_blob.c; - int elempack = bottom_top_blob.elempack; - int size = w * h * d * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = vsetvl_e16m4(n); - - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); - _p = vfmul_vv_f32m8(_p, tanh_ps(log_ps(vfadd_vf_f32m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); - vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); - - ptr += vl; - n -= vl; - } - } - - return 0; -} - -int Mish_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const -{ - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int d = bottom_top_blob.d; - int channels = bottom_top_blob.c; - int elempack = bottom_top_blob.elempack; - int size = w * h * d * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - _p = vfmul_vv_f16m8(_p, tanh_ps(log_ps(vfadd_vf_f16m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); - vse16_v_f16m8(ptr, _p, vl); - - ptr += vl; - n -= vl; - } - } - - return 0; -} -#endif // __riscv_vector && __riscv_zfh - } // namespace ncnn diff --git a/src/layer/riscv/mish_riscv.h b/src/layer/riscv/mish_riscv.h index 2e2be1a2b44..a7dc62018a9 100644 --- a/src/layer/riscv/mish_riscv.h +++ b/src/layer/riscv/mish_riscv.h @@ -27,7 +27,7 @@ class Mish_riscv : public Mish virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/mish_riscv_zfh.cpp b/src/layer/riscv/mish_riscv_zfh.cpp new file mode 100644 index 00000000000..6f614b20abb --- /dev/null +++ b/src/layer/riscv/mish_riscv_zfh.cpp @@ -0,0 +1,108 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
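The mish_riscv changes above and the new *_zfh.cpp unit that follows show the recurring shape of this refactor: the layer constructor probes the CPU for the Zfh/Zvfh extensions at runtime instead of compiling the fp16 path in unconditionally, and the fp16 kernels move into a separate translation unit guarded by NCNN_ZFH. A condensed sketch of that pattern, assuming the cpu_support_riscv_zfh() / cpu_support_riscv_zvfh() helpers from ncnn's cpu.h and a hypothetical Example_riscv layer (class declaration omitted):

    // Sketch only: runtime fp16 capability pattern mirrored from Mish_riscv / Padding_riscv.
    // "Example_riscv" and example_riscv_zfh.cpp are hypothetical names for illustration.
    #include "cpu.h" // cpu_support_riscv_zfh() / cpu_support_riscv_zvfh()

    Example_riscv::Example_riscv()
    {
    #if __riscv_vector
        support_packing = true;
    #endif
    #if NCNN_ZFH
    #if __riscv_vector
        support_fp16_storage = cpu_support_riscv_zvfh(); // vector fp16 path needs Zvfh at runtime
    #else
        support_fp16_storage = cpu_support_riscv_zfh();  // scalar fp16 path only needs Zfh
    #endif
    #endif
    }

    int Example_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
    {
    #if NCNN_ZFH
        int elembits = bottom_top_blob.elembits();
        if (opt.use_fp16_storage && elembits == 16)
        {
            // fp16 kernels are defined in the new example_riscv_zfh.cpp unit
            return opt.use_fp16_arithmetic ? forward_inplace_fp16sa(bottom_top_blob, opt)
                                           : forward_inplace_fp16s(bottom_top_blob, opt);
        }
    #endif
        // ... fp32 path unchanged ...
        return 0;
    }
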
+ +#include "mish_riscv.h" + +#if __riscv_vector +#include +#include "rvv_mathfun.h" +#if __riscv_zvfh +#include "rvv_mathfun_fp16s.h" +#endif +#endif // __riscv_vector + +namespace ncnn { + +#if NCNN_ZFH +int Mish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + + vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); + _p = __riscv_vfmul_vv_f32m8(_p, tanh_ps(log_ps(__riscv_vfadd_vf_f32m8(exp_ps(_p, vl), (__fp16)1.f, vl), vl), vl), vl); + __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + float v = (float)*ptr; + *ptr = (__fp16)(v * tanh(log(exp(v) + 1.f))); + ptr++; + } +#endif // __riscv_zvfh + } + + return 0; +} + +int Mish_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + _p = __riscv_vfmul_vv_f16m8(_p, tanh_ps(log_ps(__riscv_vfadd_vf_f16m8(exp_ps(_p, vl), (__fp16)1.f, vl), vl), vl), vl); + __riscv_vse16_v_f16m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = *ptr * (__fp16)tanh(log(exp((float)*ptr) + 1.f)); + ptr++; + } +#endif // __riscv_zvfh + } + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/packing_riscv.cpp b/src/layer/riscv/packing_riscv.cpp index 5c298da522d..aef5522d080 100644 --- a/src/layer/riscv/packing_riscv.cpp +++ b/src/layer/riscv/packing_riscv.cpp @@ -20,13 +20,19 @@ #include "riscv_usability.h" +#include "cpu.h" + namespace ncnn { Packing_riscv::Packing_riscv() { support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif #endif support_bf16_storage = true; } @@ -38,7 +44,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (elembits == 8) return forward_int8(bottom_blob, top_blob, opt); -#if __riscv_zfh +#if NCNN_ZFH if (opt.use_fp16_storage && elembits == 16) return forward_bf16s_fp16s(bottom_blob, top_blob, opt); #endif @@ -139,13 +145,13 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - size_t vl = vsetvl_e32m2(n); + size_t vl = __riscv_vsetvl_e32m2(n); - vfloat32m2_t _p0 = vle32_v_f32m2(r0, vl); - vfloat32m2_t _p1 = vle32_v_f32m2(r1, vl); - vfloat32m2_t _p2 = vle32_v_f32m2(r2, vl); - vfloat32m2_t _p3 = vle32_v_f32m2(r3, vl); - vsseg4e32_v_f32m2(outptr, _p0, _p1, _p2, _p3, vl); + vfloat32m2_t 
_p0 = __riscv_vle32_v_f32m2(r0, vl); + vfloat32m2_t _p1 = __riscv_vle32_v_f32m2(r1, vl); + vfloat32m2_t _p2 = __riscv_vle32_v_f32m2(r2, vl); + vfloat32m2_t _p3 = __riscv_vle32_v_f32m2(r3, vl); + __riscv_vsseg4e32_v_f32m2x4(outptr, __riscv_vcreate_v_f32m2x4(_p0, _p1, _p2, _p3), vl); r0 += vl; r1 += vl; @@ -183,18 +189,14 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - size_t vl = vsetvl_e32m2(n); + size_t vl = __riscv_vsetvl_e32m2(n); - vfloat32m2_t _p0; - vfloat32m2_t _p1; - vfloat32m2_t _p2; - vfloat32m2_t _p3; - vlseg4e32_v_f32m2(&_p0, &_p1, &_p2, &_p3, r0, vl); + vfloat32m2x4_t _p = __riscv_vlseg4e32_v_f32m2x4(r0, vl); - vse32_v_f32m2(outptr0, _p0, vl); - vse32_v_f32m2(outptr1, _p1, vl); - vse32_v_f32m2(outptr2, _p2, vl); - vse32_v_f32m2(outptr3, _p3, vl); + __riscv_vse32_v_f32m2(outptr0, __riscv_vget_v_f32m2x4_f32m2(_p, 0), vl); + __riscv_vse32_v_f32m2(outptr1, __riscv_vget_v_f32m2x4_f32m2(_p, 1), vl); + __riscv_vse32_v_f32m2(outptr2, __riscv_vget_v_f32m2x4_f32m2(_p, 2), vl); + __riscv_vse32_v_f32m2(outptr3, __riscv_vget_v_f32m2x4_f32m2(_p, 3), vl); r0 += vl * 4; outptr0 += vl; @@ -236,17 +238,17 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - size_t vl = vsetvl_e32m1(n); - - vfloat32m1_t _p0 = vle32_v_f32m1(r0, vl); - vfloat32m1_t _p1 = vle32_v_f32m1(r1, vl); - vfloat32m1_t _p2 = vle32_v_f32m1(r2, vl); - vfloat32m1_t _p3 = vle32_v_f32m1(r3, vl); - vfloat32m1_t _p4 = vle32_v_f32m1(r4, vl); - vfloat32m1_t _p5 = vle32_v_f32m1(r5, vl); - vfloat32m1_t _p6 = vle32_v_f32m1(r6, vl); - vfloat32m1_t _p7 = vle32_v_f32m1(r7, vl); - vsseg8e32_v_f32m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); + size_t vl = __riscv_vsetvl_e32m1(n); + + vfloat32m1_t _p0 = __riscv_vle32_v_f32m1(r0, vl); + vfloat32m1_t _p1 = __riscv_vle32_v_f32m1(r1, vl); + vfloat32m1_t _p2 = __riscv_vle32_v_f32m1(r2, vl); + vfloat32m1_t _p3 = __riscv_vle32_v_f32m1(r3, vl); + vfloat32m1_t _p4 = __riscv_vle32_v_f32m1(r4, vl); + vfloat32m1_t _p5 = __riscv_vle32_v_f32m1(r5, vl); + vfloat32m1_t _p6 = __riscv_vle32_v_f32m1(r6, vl); + vfloat32m1_t _p7 = __riscv_vle32_v_f32m1(r7, vl); + __riscv_vsseg8e32_v_f32m1x8(outptr, __riscv_vcreate_v_f32m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); r0 += vl; r1 += vl; @@ -296,25 +298,17 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - size_t vl = vsetvl_e32m1(n); - - vfloat32m1_t _p0; - vfloat32m1_t _p1; - vfloat32m1_t _p2; - vfloat32m1_t _p3; - vfloat32m1_t _p4; - vfloat32m1_t _p5; - vfloat32m1_t _p6; - vfloat32m1_t _p7; - vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); - vse32_v_f32m1(outptr0, _p0, vl); - vse32_v_f32m1(outptr1, _p1, vl); - vse32_v_f32m1(outptr2, _p2, vl); - vse32_v_f32m1(outptr3, _p3, vl); - vse32_v_f32m1(outptr4, _p4, vl); - vse32_v_f32m1(outptr5, _p5, vl); - vse32_v_f32m1(outptr6, _p6, vl); - vse32_v_f32m1(outptr7, _p7, vl); + size_t vl = __riscv_vsetvl_e32m1(n); + + vfloat32m1x8_t _p = __riscv_vlseg8e32_v_f32m1x8(r0, vl); + __riscv_vse32_v_f32m1(outptr0, __riscv_vget_v_f32m1x8_f32m1(_p, 0), vl); + __riscv_vse32_v_f32m1(outptr1, __riscv_vget_v_f32m1x8_f32m1(_p, 1), vl); + __riscv_vse32_v_f32m1(outptr2, __riscv_vget_v_f32m1x8_f32m1(_p, 2), vl); + __riscv_vse32_v_f32m1(outptr3, __riscv_vget_v_f32m1x8_f32m1(_p, 3), vl); + __riscv_vse32_v_f32m1(outptr4, __riscv_vget_v_f32m1x8_f32m1(_p, 4), vl); + __riscv_vse32_v_f32m1(outptr5, 
__riscv_vget_v_f32m1x8_f32m1(_p, 5), vl); + __riscv_vse32_v_f32m1(outptr6, __riscv_vget_v_f32m1x8_f32m1(_p, 6), vl); + __riscv_vse32_v_f32m1(outptr7, __riscv_vget_v_f32m1x8_f32m1(_p, 7), vl); r0 += vl * 8; outptr0 += vl; @@ -358,21 +352,21 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - size_t vl = vsetvl_e32m1(n); + size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t _p00; - vfloat32m1_t _p01; - vfloat32m1_t _p02; - vfloat32m1_t _p03; - vlseg4e32_v_f32m1(&_p00, &_p01, &_p02, &_p03, r0, vl); + vfloat32m1x4_t _p0 = __riscv_vlseg4e32_v_f32m1x4(r0, vl); + vfloat32m1x4_t _p1 = __riscv_vlseg4e32_v_f32m1x4(r1, vl); - vfloat32m1_t _p10; - vfloat32m1_t _p11; - vfloat32m1_t _p12; - vfloat32m1_t _p13; - vlseg4e32_v_f32m1(&_p10, &_p11, &_p12, &_p13, r1, vl); + vfloat32m1_t _p00 = __riscv_vget_v_f32m1x4_f32m1(_p0, 0); + vfloat32m1_t _p01 = __riscv_vget_v_f32m1x4_f32m1(_p0, 1); + vfloat32m1_t _p02 = __riscv_vget_v_f32m1x4_f32m1(_p0, 2); + vfloat32m1_t _p03 = __riscv_vget_v_f32m1x4_f32m1(_p0, 3); + vfloat32m1_t _p10 = __riscv_vget_v_f32m1x4_f32m1(_p1, 0); + vfloat32m1_t _p11 = __riscv_vget_v_f32m1x4_f32m1(_p1, 1); + vfloat32m1_t _p12 = __riscv_vget_v_f32m1x4_f32m1(_p1, 2); + vfloat32m1_t _p13 = __riscv_vget_v_f32m1x4_f32m1(_p1, 3); - vsseg8e32_v_f32m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); + __riscv_vsseg8e32_v_f32m1x8(outptr, __riscv_vcreate_v_f32m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); r0 += vl * 4; r1 += vl * 4; @@ -412,19 +406,21 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - size_t vl = vsetvl_e32m1(n); - - vfloat32m1_t _p0; - vfloat32m1_t _p1; - vfloat32m1_t _p2; - vfloat32m1_t _p3; - vfloat32m1_t _p4; - vfloat32m1_t _p5; - vfloat32m1_t _p6; - vfloat32m1_t _p7; - vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); - vsseg4e32_v_f32m1(outptr0, _p0, _p1, _p2, _p3, vl); - vsseg4e32_v_f32m1(outptr1, _p4, _p5, _p6, _p7, vl); + size_t vl = __riscv_vsetvl_e32m1(n); + + vfloat32m1x8_t _p = __riscv_vlseg8e32_v_f32m1x8(r0, vl); + + vfloat32m1_t _p0 = __riscv_vget_v_f32m1x8_f32m1(_p, 0); + vfloat32m1_t _p1 = __riscv_vget_v_f32m1x8_f32m1(_p, 1); + vfloat32m1_t _p2 = __riscv_vget_v_f32m1x8_f32m1(_p, 2); + vfloat32m1_t _p3 = __riscv_vget_v_f32m1x8_f32m1(_p, 3); + vfloat32m1_t _p4 = __riscv_vget_v_f32m1x8_f32m1(_p, 4); + vfloat32m1_t _p5 = __riscv_vget_v_f32m1x8_f32m1(_p, 5); + vfloat32m1_t _p6 = __riscv_vget_v_f32m1x8_f32m1(_p, 6); + vfloat32m1_t _p7 = __riscv_vget_v_f32m1x8_f32m1(_p, 7); + + __riscv_vsseg4e32_v_f32m1x4(outptr0, __riscv_vcreate_v_f32m1x4(_p0, _p1, _p2, _p3), vl); + __riscv_vsseg4e32_v_f32m1x4(outptr1, __riscv_vcreate_v_f32m1x4(_p4, _p5, _p6, _p7), vl); r0 += vl * 8; outptr0 += vl * 4; @@ -483,13 +479,14 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - size_t vl = vsetvl_e32m2(n); + size_t vl = __riscv_vsetvl_e32m2(n); + + vfloat32m2_t _p0 = __riscv_vle32_v_f32m2(r0, vl); + vfloat32m2_t _p1 = __riscv_vle32_v_f32m2(r1, vl); + vfloat32m2_t _p2 = __riscv_vle32_v_f32m2(r2, vl); + vfloat32m2_t _p3 = __riscv_vle32_v_f32m2(r3, vl); - vfloat32m2_t _p0 = vle32_v_f32m2(r0, vl); - vfloat32m2_t _p1 = vle32_v_f32m2(r1, vl); - vfloat32m2_t _p2 = vle32_v_f32m2(r2, vl); - vfloat32m2_t _p3 = vle32_v_f32m2(r3, vl); - vsseg4e32_v_f32m2(outptr, _p0, _p1, _p2, _p3, vl); + __riscv_vsseg4e32_v_f32m2x4(outptr, __riscv_vcreate_v_f32m2x4(_p0, _p1, _p2, _p3), vl); r0 
+= vl; r1 += vl; @@ -527,16 +524,13 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - size_t vl = vsetvl_e32m2(n); - vfloat32m2_t _p0; - vfloat32m2_t _p1; - vfloat32m2_t _p2; - vfloat32m2_t _p3; - vlseg4e32_v_f32m2(&_p0, &_p1, &_p2, &_p3, r0, vl); - vse32_v_f32m2(outptr0, _p0, vl); - vse32_v_f32m2(outptr1, _p1, vl); - vse32_v_f32m2(outptr2, _p2, vl); - vse32_v_f32m2(outptr3, _p3, vl); + size_t vl = __riscv_vsetvl_e32m2(n); + + vfloat32m2x4_t _p = __riscv_vlseg4e32_v_f32m2x4(r0, vl); + __riscv_vse32_v_f32m2(outptr0, __riscv_vget_v_f32m2x4_f32m2(_p, 0), vl); + __riscv_vse32_v_f32m2(outptr1, __riscv_vget_v_f32m2x4_f32m2(_p, 1), vl); + __riscv_vse32_v_f32m2(outptr2, __riscv_vget_v_f32m2x4_f32m2(_p, 2), vl); + __riscv_vse32_v_f32m2(outptr3, __riscv_vget_v_f32m2x4_f32m2(_p, 3), vl); r0 += vl * 4; outptr0 += vl; @@ -578,17 +572,17 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - size_t vl = vsetvl_e32m1(n); - - vfloat32m1_t _p0 = vle32_v_f32m1(r0, vl); - vfloat32m1_t _p1 = vle32_v_f32m1(r1, vl); - vfloat32m1_t _p2 = vle32_v_f32m1(r2, vl); - vfloat32m1_t _p3 = vle32_v_f32m1(r3, vl); - vfloat32m1_t _p4 = vle32_v_f32m1(r4, vl); - vfloat32m1_t _p5 = vle32_v_f32m1(r5, vl); - vfloat32m1_t _p6 = vle32_v_f32m1(r6, vl); - vfloat32m1_t _p7 = vle32_v_f32m1(r7, vl); - vsseg8e32_v_f32m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); + size_t vl = __riscv_vsetvl_e32m1(n); + + vfloat32m1_t _p0 = __riscv_vle32_v_f32m1(r0, vl); + vfloat32m1_t _p1 = __riscv_vle32_v_f32m1(r1, vl); + vfloat32m1_t _p2 = __riscv_vle32_v_f32m1(r2, vl); + vfloat32m1_t _p3 = __riscv_vle32_v_f32m1(r3, vl); + vfloat32m1_t _p4 = __riscv_vle32_v_f32m1(r4, vl); + vfloat32m1_t _p5 = __riscv_vle32_v_f32m1(r5, vl); + vfloat32m1_t _p6 = __riscv_vle32_v_f32m1(r6, vl); + vfloat32m1_t _p7 = __riscv_vle32_v_f32m1(r7, vl); + __riscv_vsseg8e32_v_f32m1x8(outptr, __riscv_vcreate_v_f32m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); r0 += vl; r1 += vl; @@ -638,26 +632,17 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - size_t vl = vsetvl_e32m1(n); - - vfloat32m1_t _p0; - vfloat32m1_t _p1; - vfloat32m1_t _p2; - vfloat32m1_t _p3; - vfloat32m1_t _p4; - vfloat32m1_t _p5; - vfloat32m1_t _p6; - vfloat32m1_t _p7; - vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); - - vse32_v_f32m1(outptr0, _p0, vl); - vse32_v_f32m1(outptr1, _p1, vl); - vse32_v_f32m1(outptr2, _p2, vl); - vse32_v_f32m1(outptr3, _p3, vl); - vse32_v_f32m1(outptr4, _p4, vl); - vse32_v_f32m1(outptr5, _p5, vl); - vse32_v_f32m1(outptr6, _p6, vl); - vse32_v_f32m1(outptr7, _p7, vl); + size_t vl = __riscv_vsetvl_e32m1(n); + + vfloat32m1x8_t _p = __riscv_vlseg8e32_v_f32m1x8(r0, vl); + __riscv_vse32_v_f32m1(outptr0, __riscv_vget_v_f32m1x8_f32m1(_p, 0), vl); + __riscv_vse32_v_f32m1(outptr1, __riscv_vget_v_f32m1x8_f32m1(_p, 1), vl); + __riscv_vse32_v_f32m1(outptr2, __riscv_vget_v_f32m1x8_f32m1(_p, 2), vl); + __riscv_vse32_v_f32m1(outptr3, __riscv_vget_v_f32m1x8_f32m1(_p, 3), vl); + __riscv_vse32_v_f32m1(outptr4, __riscv_vget_v_f32m1x8_f32m1(_p, 4), vl); + __riscv_vse32_v_f32m1(outptr5, __riscv_vget_v_f32m1x8_f32m1(_p, 5), vl); + __riscv_vse32_v_f32m1(outptr6, __riscv_vget_v_f32m1x8_f32m1(_p, 6), vl); + __riscv_vse32_v_f32m1(outptr7, __riscv_vget_v_f32m1x8_f32m1(_p, 7), vl); r0 += vl * 8; outptr0 += vl; @@ -701,21 +686,21 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& 
top_blob, const Option& int n = size; while (n > 0) { - size_t vl = vsetvl_e32m1(n); + size_t vl = __riscv_vsetvl_e32m1(n); - vfloat32m1_t _p00; - vfloat32m1_t _p01; - vfloat32m1_t _p02; - vfloat32m1_t _p03; - vlseg4e32_v_f32m1(&_p00, &_p01, &_p02, &_p03, r0, vl); + vfloat32m1x4_t _p0 = __riscv_vlseg4e32_v_f32m1x4(r0, vl); + vfloat32m1x4_t _p1 = __riscv_vlseg4e32_v_f32m1x4(r1, vl); - vfloat32m1_t _p10; - vfloat32m1_t _p11; - vfloat32m1_t _p12; - vfloat32m1_t _p13; - vlseg4e32_v_f32m1(&_p10, &_p11, &_p12, &_p13, r1, vl); + vfloat32m1_t _p00 = __riscv_vget_v_f32m1x4_f32m1(_p0, 0); + vfloat32m1_t _p01 = __riscv_vget_v_f32m1x4_f32m1(_p0, 1); + vfloat32m1_t _p02 = __riscv_vget_v_f32m1x4_f32m1(_p0, 2); + vfloat32m1_t _p03 = __riscv_vget_v_f32m1x4_f32m1(_p0, 3); + vfloat32m1_t _p10 = __riscv_vget_v_f32m1x4_f32m1(_p1, 0); + vfloat32m1_t _p11 = __riscv_vget_v_f32m1x4_f32m1(_p1, 1); + vfloat32m1_t _p12 = __riscv_vget_v_f32m1x4_f32m1(_p1, 2); + vfloat32m1_t _p13 = __riscv_vget_v_f32m1x4_f32m1(_p1, 3); - vsseg8e32_v_f32m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); + __riscv_vsseg8e32_v_f32m1x8(outptr, __riscv_vcreate_v_f32m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); r0 += vl * 4; r1 += vl * 4; @@ -755,19 +740,21 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - size_t vl = vsetvl_e32m1(n); - - vfloat32m1_t _p0; - vfloat32m1_t _p1; - vfloat32m1_t _p2; - vfloat32m1_t _p3; - vfloat32m1_t _p4; - vfloat32m1_t _p5; - vfloat32m1_t _p6; - vfloat32m1_t _p7; - vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); - vsseg4e32_v_f32m1(outptr0, _p0, _p1, _p2, _p3, vl); - vsseg4e32_v_f32m1(outptr1, _p4, _p5, _p6, _p7, vl); + size_t vl = __riscv_vsetvl_e32m1(n); + + vfloat32m1x8_t _p = __riscv_vlseg8e32_v_f32m1x8(r0, vl); + + vfloat32m1_t _p0 = __riscv_vget_v_f32m1x8_f32m1(_p, 0); + vfloat32m1_t _p1 = __riscv_vget_v_f32m1x8_f32m1(_p, 1); + vfloat32m1_t _p2 = __riscv_vget_v_f32m1x8_f32m1(_p, 2); + vfloat32m1_t _p3 = __riscv_vget_v_f32m1x8_f32m1(_p, 3); + vfloat32m1_t _p4 = __riscv_vget_v_f32m1x8_f32m1(_p, 4); + vfloat32m1_t _p5 = __riscv_vget_v_f32m1x8_f32m1(_p, 5); + vfloat32m1_t _p6 = __riscv_vget_v_f32m1x8_f32m1(_p, 6); + vfloat32m1_t _p7 = __riscv_vget_v_f32m1x8_f32m1(_p, 7); + + __riscv_vsseg4e32_v_f32m1x4(outptr0, __riscv_vcreate_v_f32m1x4(_p0, _p1, _p2, _p3), vl); + __riscv_vsseg4e32_v_f32m1x4(outptr1, __riscv_vcreate_v_f32m1x4(_p4, _p5, _p6, _p7), vl); r0 += vl * 8; outptr0 += vl * 4; @@ -889,13 +876,13 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - size_t vl = vsetvl_e16m2(n); + size_t vl = __riscv_vsetvl_e16m2(n); - vuint16m2_t _p0 = vle16_v_u16m2(r0, vl); - vuint16m2_t _p1 = vle16_v_u16m2(r1, vl); - vuint16m2_t _p2 = vle16_v_u16m2(r2, vl); - vuint16m2_t _p3 = vle16_v_u16m2(r3, vl); - vsseg4e16_v_u16m2(outptr, _p0, _p1, _p2, _p3, vl); + vuint16m2_t _p0 = __riscv_vle16_v_u16m2(r0, vl); + vuint16m2_t _p1 = __riscv_vle16_v_u16m2(r1, vl); + vuint16m2_t _p2 = __riscv_vle16_v_u16m2(r2, vl); + vuint16m2_t _p3 = __riscv_vle16_v_u16m2(r3, vl); + __riscv_vsseg4e16_v_u16m2x4(outptr, __riscv_vcreate_v_u16m2x4(_p0, _p1, _p2, _p3), vl); r0 += vl; r1 += vl; @@ -933,17 +920,13 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - size_t vl = vsetvl_e16m2(n); - - vuint16m2_t _p0; - vuint16m2_t _p1; - vuint16m2_t _p2; - vuint16m2_t _p3; - vlseg4e16_v_u16m2(&_p0, &_p1, &_p2, &_p3, r0, 
vl); - vse16_v_u16m2(outptr0, _p0, vl); - vse16_v_u16m2(outptr1, _p1, vl); - vse16_v_u16m2(outptr2, _p2, vl); - vse16_v_u16m2(outptr3, _p3, vl); + size_t vl = __riscv_vsetvl_e16m2(n); + + vuint16m2x4_t _p = __riscv_vlseg4e16_v_u16m2x4(r0, vl); + __riscv_vse16_v_u16m2(outptr0, __riscv_vget_v_u16m2x4_u16m2(_p, 0), vl); + __riscv_vse16_v_u16m2(outptr1, __riscv_vget_v_u16m2x4_u16m2(_p, 1), vl); + __riscv_vse16_v_u16m2(outptr2, __riscv_vget_v_u16m2x4_u16m2(_p, 2), vl); + __riscv_vse16_v_u16m2(outptr3, __riscv_vget_v_u16m2x4_u16m2(_p, 3), vl); r0 += vl * 4; outptr0 += vl; @@ -985,17 +968,17 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - size_t vl = vsetvl_e16m1(n); - - vuint16m1_t _p0 = vle16_v_u16m1(r0, vl); - vuint16m1_t _p1 = vle16_v_u16m1(r1, vl); - vuint16m1_t _p2 = vle16_v_u16m1(r2, vl); - vuint16m1_t _p3 = vle16_v_u16m1(r3, vl); - vuint16m1_t _p4 = vle16_v_u16m1(r4, vl); - vuint16m1_t _p5 = vle16_v_u16m1(r5, vl); - vuint16m1_t _p6 = vle16_v_u16m1(r6, vl); - vuint16m1_t _p7 = vle16_v_u16m1(r7, vl); - vsseg8e16_v_u16m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); + size_t vl = __riscv_vsetvl_e16m1(n); + + vuint16m1_t _p0 = __riscv_vle16_v_u16m1(r0, vl); + vuint16m1_t _p1 = __riscv_vle16_v_u16m1(r1, vl); + vuint16m1_t _p2 = __riscv_vle16_v_u16m1(r2, vl); + vuint16m1_t _p3 = __riscv_vle16_v_u16m1(r3, vl); + vuint16m1_t _p4 = __riscv_vle16_v_u16m1(r4, vl); + vuint16m1_t _p5 = __riscv_vle16_v_u16m1(r5, vl); + vuint16m1_t _p6 = __riscv_vle16_v_u16m1(r6, vl); + vuint16m1_t _p7 = __riscv_vle16_v_u16m1(r7, vl); + __riscv_vsseg8e16_v_u16m1x8(outptr, __riscv_vcreate_v_u16m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); r0 += vl; r1 += vl; @@ -1045,26 +1028,17 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - size_t vl = vsetvl_e16m1(n); - - vuint16m1_t _p0; - vuint16m1_t _p1; - vuint16m1_t _p2; - vuint16m1_t _p3; - vuint16m1_t _p4; - vuint16m1_t _p5; - vuint16m1_t _p6; - vuint16m1_t _p7; - vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); - - vse16_v_u16m1(outptr0, _p0, vl); - vse16_v_u16m1(outptr1, _p1, vl); - vse16_v_u16m1(outptr2, _p2, vl); - vse16_v_u16m1(outptr3, _p3, vl); - vse16_v_u16m1(outptr4, _p4, vl); - vse16_v_u16m1(outptr5, _p5, vl); - vse16_v_u16m1(outptr6, _p6, vl); - vse16_v_u16m1(outptr7, _p7, vl); + size_t vl = __riscv_vsetvl_e16m1(n); + + vuint16m1x8_t _p = __riscv_vlseg8e16_v_u16m1x8(r0, vl); + __riscv_vse16_v_u16m1(outptr0, __riscv_vget_v_u16m1x8_u16m1(_p, 0), vl); + __riscv_vse16_v_u16m1(outptr1, __riscv_vget_v_u16m1x8_u16m1(_p, 1), vl); + __riscv_vse16_v_u16m1(outptr2, __riscv_vget_v_u16m1x8_u16m1(_p, 2), vl); + __riscv_vse16_v_u16m1(outptr3, __riscv_vget_v_u16m1x8_u16m1(_p, 3), vl); + __riscv_vse16_v_u16m1(outptr4, __riscv_vget_v_u16m1x8_u16m1(_p, 4), vl); + __riscv_vse16_v_u16m1(outptr5, __riscv_vget_v_u16m1x8_u16m1(_p, 5), vl); + __riscv_vse16_v_u16m1(outptr6, __riscv_vget_v_u16m1x8_u16m1(_p, 6), vl); + __riscv_vse16_v_u16m1(outptr7, __riscv_vget_v_u16m1x8_u16m1(_p, 7), vl); r0 += vl * 8; outptr0 += vl; @@ -1108,21 +1082,21 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - size_t vl = vsetvl_e16m1(n); + size_t vl = __riscv_vsetvl_e16m1(n); - vuint16m1_t _p00; - vuint16m1_t _p01; - vuint16m1_t _p02; - vuint16m1_t _p03; - vlseg4e16_v_u16m1(&_p00, &_p01, &_p02, &_p03, r0, vl); + vuint16m1x4_t _p0 = __riscv_vlseg4e16_v_u16m1x4(r0, vl); + vuint16m1x4_t _p1 
= __riscv_vlseg4e16_v_u16m1x4(r1, vl); - vuint16m1_t _p10; - vuint16m1_t _p11; - vuint16m1_t _p12; - vuint16m1_t _p13; - vlseg4e16_v_u16m1(&_p10, &_p11, &_p12, &_p13, r1, vl); + vuint16m1_t _p00 = __riscv_vget_v_u16m1x4_u16m1(_p0, 0); + vuint16m1_t _p01 = __riscv_vget_v_u16m1x4_u16m1(_p0, 1); + vuint16m1_t _p02 = __riscv_vget_v_u16m1x4_u16m1(_p0, 2); + vuint16m1_t _p03 = __riscv_vget_v_u16m1x4_u16m1(_p0, 3); + vuint16m1_t _p10 = __riscv_vget_v_u16m1x4_u16m1(_p1, 0); + vuint16m1_t _p11 = __riscv_vget_v_u16m1x4_u16m1(_p1, 1); + vuint16m1_t _p12 = __riscv_vget_v_u16m1x4_u16m1(_p1, 2); + vuint16m1_t _p13 = __riscv_vget_v_u16m1x4_u16m1(_p1, 3); - vsseg8e16_v_u16m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); + __riscv_vsseg8e16_v_u16m1x8(outptr, __riscv_vcreate_v_u16m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); r0 += vl * 4; r1 += vl * 4; @@ -1162,20 +1136,21 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - size_t vl = vsetvl_e16m1(n); + size_t vl = __riscv_vsetvl_e16m1(n); - vuint16m1_t _p0; - vuint16m1_t _p1; - vuint16m1_t _p2; - vuint16m1_t _p3; - vuint16m1_t _p4; - vuint16m1_t _p5; - vuint16m1_t _p6; - vuint16m1_t _p7; - vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + vuint16m1x8_t _p = __riscv_vlseg8e16_v_u16m1x8(r0, vl); - vsseg4e16_v_u16m1(outptr0, _p0, _p1, _p2, _p3, vl); - vsseg4e16_v_u16m1(outptr1, _p4, _p5, _p6, _p7, vl); + vuint16m1_t _p0 = __riscv_vget_v_u16m1x8_u16m1(_p, 0); + vuint16m1_t _p1 = __riscv_vget_v_u16m1x8_u16m1(_p, 1); + vuint16m1_t _p2 = __riscv_vget_v_u16m1x8_u16m1(_p, 2); + vuint16m1_t _p3 = __riscv_vget_v_u16m1x8_u16m1(_p, 3); + vuint16m1_t _p4 = __riscv_vget_v_u16m1x8_u16m1(_p, 4); + vuint16m1_t _p5 = __riscv_vget_v_u16m1x8_u16m1(_p, 5); + vuint16m1_t _p6 = __riscv_vget_v_u16m1x8_u16m1(_p, 6); + vuint16m1_t _p7 = __riscv_vget_v_u16m1x8_u16m1(_p, 7); + + __riscv_vsseg4e16_v_u16m1x4(outptr0, __riscv_vcreate_v_u16m1x4(_p0, _p1, _p2, _p3), vl); + __riscv_vsseg4e16_v_u16m1x4(outptr1, __riscv_vcreate_v_u16m1x4(_p4, _p5, _p6, _p7), vl); r0 += vl * 8; outptr0 += vl * 4; @@ -1234,13 +1209,13 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - size_t vl = vsetvl_e16m2(n); + size_t vl = __riscv_vsetvl_e16m2(n); - vuint16m2_t _p0 = vle16_v_u16m2(r0, vl); - vuint16m2_t _p1 = vle16_v_u16m2(r1, vl); - vuint16m2_t _p2 = vle16_v_u16m2(r2, vl); - vuint16m2_t _p3 = vle16_v_u16m2(r3, vl); - vsseg4e16_v_u16m2(outptr, _p0, _p1, _p2, _p3, vl); + vuint16m2_t _p0 = __riscv_vle16_v_u16m2(r0, vl); + vuint16m2_t _p1 = __riscv_vle16_v_u16m2(r1, vl); + vuint16m2_t _p2 = __riscv_vle16_v_u16m2(r2, vl); + vuint16m2_t _p3 = __riscv_vle16_v_u16m2(r3, vl); + __riscv_vsseg4e16_v_u16m2x4(outptr, __riscv_vcreate_v_u16m2x4(_p0, _p1, _p2, _p3), vl); r0 += vl; r1 += vl; @@ -1278,17 +1253,13 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - size_t vl = vsetvl_e16m2(n); - - vuint16m2_t _p0; - vuint16m2_t _p1; - vuint16m2_t _p2; - vuint16m2_t _p3; - vlseg4e16_v_u16m2(&_p0, &_p1, &_p2, &_p3, r0, vl); - vse16_v_u16m2(outptr0, _p0, vl); - vse16_v_u16m2(outptr1, _p1, vl); - vse16_v_u16m2(outptr2, _p2, vl); - vse16_v_u16m2(outptr3, _p3, vl); + size_t vl = __riscv_vsetvl_e16m2(n); + + vuint16m2x4_t _p = __riscv_vlseg4e16_v_u16m2x4(r0, vl); + __riscv_vse16_v_u16m2(outptr0, __riscv_vget_v_u16m2x4_u16m2(_p, 0), vl); + __riscv_vse16_v_u16m2(outptr1, 
__riscv_vget_v_u16m2x4_u16m2(_p, 1), vl); + __riscv_vse16_v_u16m2(outptr2, __riscv_vget_v_u16m2x4_u16m2(_p, 2), vl); + __riscv_vse16_v_u16m2(outptr3, __riscv_vget_v_u16m2x4_u16m2(_p, 3), vl); r0 += vl * 4; outptr0 += vl; @@ -1330,17 +1301,17 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - size_t vl = vsetvl_e16m1(n); - - vuint16m1_t _p0 = vle16_v_u16m1(r0, vl); - vuint16m1_t _p1 = vle16_v_u16m1(r1, vl); - vuint16m1_t _p2 = vle16_v_u16m1(r2, vl); - vuint16m1_t _p3 = vle16_v_u16m1(r3, vl); - vuint16m1_t _p4 = vle16_v_u16m1(r4, vl); - vuint16m1_t _p5 = vle16_v_u16m1(r5, vl); - vuint16m1_t _p6 = vle16_v_u16m1(r6, vl); - vuint16m1_t _p7 = vle16_v_u16m1(r7, vl); - vsseg8e16_v_u16m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); + size_t vl = __riscv_vsetvl_e16m1(n); + + vuint16m1_t _p0 = __riscv_vle16_v_u16m1(r0, vl); + vuint16m1_t _p1 = __riscv_vle16_v_u16m1(r1, vl); + vuint16m1_t _p2 = __riscv_vle16_v_u16m1(r2, vl); + vuint16m1_t _p3 = __riscv_vle16_v_u16m1(r3, vl); + vuint16m1_t _p4 = __riscv_vle16_v_u16m1(r4, vl); + vuint16m1_t _p5 = __riscv_vle16_v_u16m1(r5, vl); + vuint16m1_t _p6 = __riscv_vle16_v_u16m1(r6, vl); + vuint16m1_t _p7 = __riscv_vle16_v_u16m1(r7, vl); + __riscv_vsseg8e16_v_u16m1x8(outptr, __riscv_vcreate_v_u16m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); r0 += vl; r1 += vl; @@ -1390,25 +1361,17 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - size_t vl = vsetvl_e16m1(n); - - vuint16m1_t _p0; - vuint16m1_t _p1; - vuint16m1_t _p2; - vuint16m1_t _p3; - vuint16m1_t _p4; - vuint16m1_t _p5; - vuint16m1_t _p6; - vuint16m1_t _p7; - vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); - vse16_v_u16m1(outptr0, _p0, vl); - vse16_v_u16m1(outptr1, _p1, vl); - vse16_v_u16m1(outptr2, _p2, vl); - vse16_v_u16m1(outptr3, _p3, vl); - vse16_v_u16m1(outptr4, _p4, vl); - vse16_v_u16m1(outptr5, _p5, vl); - vse16_v_u16m1(outptr6, _p6, vl); - vse16_v_u16m1(outptr7, _p7, vl); + size_t vl = __riscv_vsetvl_e16m1(n); + + vuint16m1x8_t _p = __riscv_vlseg8e16_v_u16m1x8(r0, vl); + __riscv_vse16_v_u16m1(outptr0, __riscv_vget_v_u16m1x8_u16m1(_p, 0), vl); + __riscv_vse16_v_u16m1(outptr1, __riscv_vget_v_u16m1x8_u16m1(_p, 1), vl); + __riscv_vse16_v_u16m1(outptr2, __riscv_vget_v_u16m1x8_u16m1(_p, 2), vl); + __riscv_vse16_v_u16m1(outptr3, __riscv_vget_v_u16m1x8_u16m1(_p, 3), vl); + __riscv_vse16_v_u16m1(outptr4, __riscv_vget_v_u16m1x8_u16m1(_p, 4), vl); + __riscv_vse16_v_u16m1(outptr5, __riscv_vget_v_u16m1x8_u16m1(_p, 5), vl); + __riscv_vse16_v_u16m1(outptr6, __riscv_vget_v_u16m1x8_u16m1(_p, 6), vl); + __riscv_vse16_v_u16m1(outptr7, __riscv_vget_v_u16m1x8_u16m1(_p, 7), vl); r0 += vl * 8; outptr0 += vl; @@ -1452,21 +1415,20 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - size_t vl = vsetvl_e16m1(n); + size_t vl = __riscv_vsetvl_e16m1(n); - vuint16m1_t _p00; - vuint16m1_t _p01; - vuint16m1_t _p02; - vuint16m1_t _p03; - vlseg4e16_v_u16m1(&_p00, &_p01, &_p02, &_p03, r0, vl); + vuint16m1x4_t _p0 = __riscv_vlseg4e16_v_u16m1x4(r0, vl); + vuint16m1x4_t _p1 = __riscv_vlseg4e16_v_u16m1x4(r1, vl); - vuint16m1_t _p10; - vuint16m1_t _p11; - vuint16m1_t _p12; - vuint16m1_t _p13; - vlseg4e16_v_u16m1(&_p10, &_p11, &_p12, &_p13, r1, vl); - - vsseg8e16_v_u16m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); + vuint16m1_t _p00 = __riscv_vget_v_u16m1x4_u16m1(_p0, 0); + vuint16m1_t _p01 = 
__riscv_vget_v_u16m1x4_u16m1(_p0, 1); + vuint16m1_t _p02 = __riscv_vget_v_u16m1x4_u16m1(_p0, 2); + vuint16m1_t _p03 = __riscv_vget_v_u16m1x4_u16m1(_p0, 3); + vuint16m1_t _p10 = __riscv_vget_v_u16m1x4_u16m1(_p1, 0); + vuint16m1_t _p11 = __riscv_vget_v_u16m1x4_u16m1(_p1, 1); + vuint16m1_t _p12 = __riscv_vget_v_u16m1x4_u16m1(_p1, 2); + vuint16m1_t _p13 = __riscv_vget_v_u16m1x4_u16m1(_p1, 3); + __riscv_vsseg8e16_v_u16m1x8(outptr, __riscv_vcreate_v_u16m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); r0 += vl * 4; r1 += vl * 4; @@ -1506,20 +1468,21 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - size_t vl = vsetvl_e16m1(n); - - vuint16m1_t _p0; - vuint16m1_t _p1; - vuint16m1_t _p2; - vuint16m1_t _p3; - vuint16m1_t _p4; - vuint16m1_t _p5; - vuint16m1_t _p6; - vuint16m1_t _p7; - vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); - - vsseg4e16_v_u16m1(outptr0, _p0, _p1, _p2, _p3, vl); - vsseg4e16_v_u16m1(outptr1, _p4, _p5, _p6, _p7, vl); + size_t vl = __riscv_vsetvl_e16m1(n); + + vuint16m1x8_t _p = __riscv_vlseg8e16_v_u16m1x8(r0, vl); + + vuint16m1_t _p0 = __riscv_vget_v_u16m1x8_u16m1(_p, 0); + vuint16m1_t _p1 = __riscv_vget_v_u16m1x8_u16m1(_p, 1); + vuint16m1_t _p2 = __riscv_vget_v_u16m1x8_u16m1(_p, 2); + vuint16m1_t _p3 = __riscv_vget_v_u16m1x8_u16m1(_p, 3); + vuint16m1_t _p4 = __riscv_vget_v_u16m1x8_u16m1(_p, 4); + vuint16m1_t _p5 = __riscv_vget_v_u16m1x8_u16m1(_p, 5); + vuint16m1_t _p6 = __riscv_vget_v_u16m1x8_u16m1(_p, 6); + vuint16m1_t _p7 = __riscv_vget_v_u16m1x8_u16m1(_p, 7); + + __riscv_vsseg4e16_v_u16m1x4(outptr0, __riscv_vcreate_v_u16m1x4(_p0, _p1, _p2, _p3), vl); + __riscv_vsseg4e16_v_u16m1x4(outptr1, __riscv_vcreate_v_u16m1x4(_p4, _p5, _p6, _p7), vl); r0 += vl * 8; outptr0 += vl * 4; diff --git a/src/layer/riscv/padding_packn.h b/src/layer/riscv/padding_packn.h index 50f5efe1216..3b057a98522 100644 --- a/src/layer/riscv/padding_packn.h +++ b/src/layer/riscv/padding_packn.h @@ -16,7 +16,7 @@ static void padding_constant_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right, v##VT##m##LMUL##_t v) \ { \ const int packn = csrr_vlenb() / sizeof(T); \ - const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ + const size_t vl = __riscv_vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ T* outptr = dst; \ @@ -26,7 +26,7 @@ { \ for (int x = 0; x < dst.w; x++) \ { \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, v, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, v, vl); \ outptr += packn; \ } \ } \ @@ -35,19 +35,19 @@ { \ for (int x = 0; x < left; x++) \ { \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, v, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, v, vl); \ outptr += packn; \ } \ for (int x = 0; x < src.w; x++) \ { \ - v##VT##m##LMUL##_t _p = vle##SEW##_v_##TSEW##m##LMUL(ptr, vl); \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ + v##VT##m##LMUL##_t _p = __riscv_vle##SEW##_v_##TSEW##m##LMUL(ptr, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ ptr += packn; \ outptr += packn; \ } \ for (int x = 0; x < right; x++) \ { \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, v, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, v, vl); \ outptr += packn; \ } \ } \ @@ -56,7 +56,7 @@ { \ for (int x = 0; x < dst.w; x++) \ { \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, v, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, v, vl); \ outptr += packn; \ } \ } \ @@ -65,7 +65,7 @@ static void padding_replicate_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int 
bottom, int left, int right) \ { \ const int packn = csrr_vlenb() / sizeof(T); \ - const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ + const size_t vl = __riscv_vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ T* outptr = dst; \ @@ -74,44 +74,44 @@ for (int y = 0; y < top; y++) \ { \ const T* ptr0 = ptr; \ - v##VT##m##LMUL##_t _p = vle##SEW##_v_##TSEW##m##LMUL(ptr0, vl); \ + v##VT##m##LMUL##_t _p = __riscv_vle##SEW##_v_##TSEW##m##LMUL(ptr0, vl); \ for (int x = 0; x < left; x++) \ { \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ outptr += packn; \ } \ for (int x = 0; x < src.w; x++) \ { \ - _p = vle##SEW##_v_##TSEW##m##LMUL(ptr0, vl); \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ + _p = __riscv_vle##SEW##_v_##TSEW##m##LMUL(ptr0, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ ptr0 += packn; \ outptr += packn; \ } \ for (int x = 0; x < right; x++) \ { \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ outptr += packn; \ } \ } \ /* fill center */ \ for (int y = 0; y < src.h; y++) \ { \ - v##VT##m##LMUL##_t _p = vle##SEW##_v_##TSEW##m##LMUL(ptr, vl); \ + v##VT##m##LMUL##_t _p = __riscv_vle##SEW##_v_##TSEW##m##LMUL(ptr, vl); \ for (int x = 0; x < left; x++) \ { \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ outptr += packn; \ } \ for (int x = 0; x < src.w; x++) \ { \ - _p = vle##SEW##_v_##TSEW##m##LMUL(ptr, vl); \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ + _p = __riscv_vle##SEW##_v_##TSEW##m##LMUL(ptr, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ ptr += packn; \ outptr += packn; \ } \ for (int x = 0; x < right; x++) \ { \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ outptr += packn; \ } \ } \ @@ -120,22 +120,22 @@ for (int y = 0; y < bottom; y++) \ { \ const T* ptr0 = ptr; \ - v##VT##m##LMUL##_t _p = vle##SEW##_v_##TSEW##m##LMUL(ptr0, vl); \ + v##VT##m##LMUL##_t _p = __riscv_vle##SEW##_v_##TSEW##m##LMUL(ptr0, vl); \ for (int x = 0; x < left; x++) \ { \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ outptr += packn; \ } \ for (int x = 0; x < src.w; x++) \ { \ - _p = vle##SEW##_v_##TSEW##m##LMUL(ptr0, vl); \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ + _p = __riscv_vle##SEW##_v_##TSEW##m##LMUL(ptr0, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ ptr0 += packn; \ outptr += packn; \ } \ for (int x = 0; x < right; x++) \ { \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ outptr += packn; \ } \ } \ @@ -144,7 +144,7 @@ static void padding_reflect_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ const int packn = csrr_vlenb() / sizeof(T); \ - const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ + const size_t vl = __riscv_vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ T* outptr = dst; \ @@ -156,21 +156,21 @@ const T* ptr0 = ptr; \ for (int x = 0; x < left; x++) \ { \ - v##VT##m##LMUL##_t _p = vle##SEW##_v_##TSEW##m##LMUL(ptr0 + (left - x) * packn, vl); \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ + v##VT##m##LMUL##_t _p = __riscv_vle##SEW##_v_##TSEW##m##LMUL(ptr0 + (left - x) * packn, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ outptr += packn; \ } \ for (int x = 0; x < src.w; 
x++) \ { \ - v##VT##m##LMUL##_t _p = vle##SEW##_v_##TSEW##m##LMUL(ptr0, vl); \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ + v##VT##m##LMUL##_t _p = __riscv_vle##SEW##_v_##TSEW##m##LMUL(ptr0, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ ptr0 += packn; \ outptr += packn; \ } \ for (int x = 0; x < right; x++) \ { \ - v##VT##m##LMUL##_t _p = vle##SEW##_v_##TSEW##m##LMUL(ptr0 - packn * 2 - x * packn, vl); \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ + v##VT##m##LMUL##_t _p = __riscv_vle##SEW##_v_##TSEW##m##LMUL(ptr0 - packn * 2 - x * packn, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ outptr += packn; \ } \ ptr -= src.w * packn; \ @@ -180,21 +180,21 @@ { \ for (int x = 0; x < left; x++) \ { \ - v##VT##m##LMUL##_t _p = vle##SEW##_v_##TSEW##m##LMUL(ptr + (left - x) * packn, vl); \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ + v##VT##m##LMUL##_t _p = __riscv_vle##SEW##_v_##TSEW##m##LMUL(ptr + (left - x) * packn, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ outptr += packn; \ } \ for (int x = 0; x < src.w; x++) \ { \ - v##VT##m##LMUL##_t _p = vle##SEW##_v_##TSEW##m##LMUL(ptr, vl); \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ + v##VT##m##LMUL##_t _p = __riscv_vle##SEW##_v_##TSEW##m##LMUL(ptr, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ ptr += packn; \ outptr += packn; \ } \ for (int x = 0; x < right; x++) \ { \ - v##VT##m##LMUL##_t _p = vle##SEW##_v_##TSEW##m##LMUL(ptr - packn * 2 - x * packn, vl); \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ + v##VT##m##LMUL##_t _p = __riscv_vle##SEW##_v_##TSEW##m##LMUL(ptr - packn * 2 - x * packn, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ outptr += packn; \ } \ } \ @@ -205,21 +205,21 @@ const T* ptr0 = ptr; \ for (int x = 0; x < left; x++) \ { \ - v##VT##m##LMUL##_t _p = vle##SEW##_v_##TSEW##m##LMUL(ptr0 + (left - x) * packn, vl); \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ + v##VT##m##LMUL##_t _p = __riscv_vle##SEW##_v_##TSEW##m##LMUL(ptr0 + (left - x) * packn, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ outptr += packn; \ } \ for (int x = 0; x < src.w; x++) \ { \ - v##VT##m##LMUL##_t _p = vle##SEW##_v_##TSEW##m##LMUL(ptr0, vl); \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ + v##VT##m##LMUL##_t _p = __riscv_vle##SEW##_v_##TSEW##m##LMUL(ptr0, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ ptr0 += packn; \ outptr += packn; \ } \ for (int x = 0; x < right; x++) \ { \ - v##VT##m##LMUL##_t _p = vle##SEW##_v_##TSEW##m##LMUL(ptr0 - packn * 2 - x * packn, vl); \ - vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ + v##VT##m##LMUL##_t _p = __riscv_vle##SEW##_v_##TSEW##m##LMUL(ptr0 - packn * 2 - x * packn, vl); \ + __riscv_vse##SEW##_v_##TSEW##m##LMUL(outptr, _p, vl); \ outptr += packn; \ } \ ptr -= src.w * packn; \ diff --git a/src/layer/riscv/padding_riscv.cpp b/src/layer/riscv/padding_riscv.cpp index 8f4b54da590..87a9ef2e827 100644 --- a/src/layer/riscv/padding_riscv.cpp +++ b/src/layer/riscv/padding_riscv.cpp @@ -20,6 +20,8 @@ #include "riscv_usability.h" +#include "cpu.h" + namespace ncnn { #if __riscv_vector @@ -30,10 +32,14 @@ Padding_riscv::Padding_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif #if NCNN_BF16 support_bf16_storage = true; @@ 
-42,9 +48,11 @@ Padding_riscv::Padding_riscv() int Padding_riscv::create_pipeline(const Option& opt) { -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH if (opt.use_fp16_storage) { + value_fp16 = float32_to_float16(value); + ncnn::cast_float32_to_float16(per_channel_pad_data, per_channel_pad_data_fp16, opt); } #endif @@ -79,7 +87,7 @@ int Padding_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (elembits == 8) return forward_int8(bottom_blob, top_blob, opt); -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH if (opt.use_fp16_storage && elembits == 16) return forward_bf16s_fp16s(bottom_blob, top_blob, opt); #endif @@ -91,7 +99,7 @@ int Padding_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& #if __riscv_vector const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -118,7 +126,7 @@ int Padding_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (top_blob.empty()) return -100; - vfloat32m1_t pad_value = vfmv_v_f_f32m1(value, vl); + vfloat32m1_t pad_value = __riscv_vfmv_v_f_f32m1(value, vl); padding_constant_packn_float32_rvv(bottom_blob, top_blob, 0, 0, left / packn, right / packn, pad_value); return 0; @@ -139,7 +147,7 @@ int Padding_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (top_blob.empty()) return -100; - vfloat32m1_t pad_value = vfmv_v_f_f32m1(value, vl); + vfloat32m1_t pad_value = __riscv_vfmv_v_f_f32m1(value, vl); padding_constant_packn_float32_rvv(bottom_blob, top_blob, top / packn, bottom / packn, left, right, pad_value); return 0; @@ -167,7 +175,7 @@ int Padding_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& { Mat borderm = top_blob.channel(q); - vfloat32m1_t pad_value = per_channel_pad_data_size ? vle32_v_f32m1((const float*)per_channel_pad_data + q * packn, vl) : vfmv_v_f_f32m1(value, vl); + vfloat32m1_t pad_value = per_channel_pad_data_size ? __riscv_vle32_v_f32m1((const float*)per_channel_pad_data + q * packn, vl) : __riscv_vfmv_v_f_f32m1(value, vl); //Channel padding if ((q - front_) < 0 || (q - front_) >= channels) { @@ -204,7 +212,7 @@ int Padding_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { - vfloat32m1_t pad_value = per_channel_pad_data_size ? vle32_v_f32m1((const float*)per_channel_pad_data + q * packn, vl) : vfmv_v_f_f32m1(value, vl); + vfloat32m1_t pad_value = per_channel_pad_data_size ? 
__riscv_vle32_v_f32m1((const float*)per_channel_pad_data + q * packn, vl) : __riscv_vfmv_v_f_f32m1(value, vl); for (int z = 0; z < outd; z++) { @@ -261,7 +269,7 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co { #if __riscv_vector const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); #endif int w = bottom_blob.w; @@ -291,17 +299,17 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co // clang-format off // *INDENT-OFF* vuint16m1_t pad_value; -#if __riscv_zfh +#if NCNN_ZFH if (opt.use_fp16_storage) { - pad_value = vreinterpret_v_f16m1_u16m1(vfmv_v_f_f16m1((__fp16)value, vl)); + pad_value = __riscv_vmv_v_x_u16m1(value_fp16, vl); } else #endif #if NCNN_BF16 if (opt.use_bf16_storage) { - pad_value = vmv_v_x_u16m1(value_bf16, vl); + pad_value = __riscv_vmv_v_x_u16m1(value_bf16, vl); } else #endif @@ -332,17 +340,17 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co // clang-format off // *INDENT-OFF* vuint16m1_t pad_value; -#if __riscv_zfh +#if NCNN_ZFH if (opt.use_fp16_storage) { - pad_value = vreinterpret_v_f16m1_u16m1(vfmv_v_f_f16m1((__fp16)value, vl)); + pad_value = __riscv_vmv_v_x_u16m1(value_fp16, vl); } else #endif #if NCNN_BF16 if (opt.use_bf16_storage) { - pad_value = vmv_v_x_u16m1(value_bf16, vl); + pad_value = __riscv_vmv_v_x_u16m1(value_bf16, vl); } else #endif @@ -380,17 +388,17 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co // clang-format off // *INDENT-OFF* vuint16m1_t pad_value; -#if __riscv_zfh +#if NCNN_ZFH if (opt.use_fp16_storage) { - pad_value = per_channel_pad_data_size ? vreinterpret_v_f16m1_u16m1(vle16_v_f16m1((const __fp16*)per_channel_pad_data_fp16 + q * packn, vl)) : vreinterpret_v_f16m1_u16m1(vfmv_v_f_f16m1((__fp16)value, vl)); + pad_value = per_channel_pad_data_size ? __riscv_vle16_v_u16m1((const unsigned short*)per_channel_pad_data_fp16 + q * packn, vl) : __riscv_vmv_v_x_u16m1(value_fp16, vl); } else #endif #if NCNN_BF16 if (opt.use_bf16_storage) { - pad_value = per_channel_pad_data_size ? vle16_v_u16m1((const unsigned short*)per_channel_pad_data_bf16 + q * packn, vl) : vmv_v_x_u16m1(value_bf16, vl); + pad_value = per_channel_pad_data_size ? __riscv_vle16_v_u16m1((const unsigned short*)per_channel_pad_data_bf16 + q * packn, vl) : __riscv_vmv_v_x_u16m1(value_bf16, vl); } else #endif @@ -438,17 +446,17 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co // clang-format off // *INDENT-OFF* vuint16m1_t pad_value; -#if __riscv_zfh +#if NCNN_ZFH if (opt.use_fp16_storage) { - pad_value = per_channel_pad_data_size ? vreinterpret_v_f16m1_u16m1(vle16_v_f16m1((const __fp16*)per_channel_pad_data_fp16 + q * packn, vl)) : vreinterpret_v_f16m1_u16m1(vfmv_v_f_f16m1((__fp16)value, vl)); + pad_value = per_channel_pad_data_size ? __riscv_vle16_v_u16m1((const unsigned short*)per_channel_pad_data_fp16 + q * packn, vl) : __riscv_vmv_v_x_u16m1(value_fp16, vl); } else #endif #if NCNN_BF16 if (opt.use_bf16_storage) { - pad_value = per_channel_pad_data_size ? vle16_v_u16m1((const unsigned short*)per_channel_pad_data_bf16 + q * packn, vl) : vmv_v_x_u16m1(value_bf16, vl); + pad_value = per_channel_pad_data_size ? 
__riscv_vle16_v_u16m1((const unsigned short*)per_channel_pad_data_bf16 + q * packn, vl) : __riscv_vmv_v_x_u16m1(value_bf16, vl); } else #endif @@ -511,7 +519,7 @@ int Padding_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt { #if __riscv_vector const int packn = csrr_vlenb() / 1; - const size_t vl = vsetvl_e8m1(packn); + const size_t vl = __riscv_vsetvl_e8m1(packn); #endif int w = bottom_blob.w; @@ -538,7 +546,7 @@ int Padding_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt if (top_blob.empty()) return -100; - vint8m1_t pad_value = vmv_v_x_i8m1((signed char)value, vl); + vint8m1_t pad_value = __riscv_vmv_v_x_i8m1((signed char)value, vl); padding_constant_packn_int8_rvv(bottom_blob, top_blob, 0, 0, left / packn, right / packn, pad_value); return 0; @@ -559,7 +567,7 @@ int Padding_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt if (top_blob.empty()) return -100; - vint8m1_t pad_value = vmv_v_x_i8m1((signed char)value, vl); + vint8m1_t pad_value = __riscv_vmv_v_x_i8m1((signed char)value, vl); padding_constant_packn_int8_rvv(bottom_blob, top_blob, top / packn, bottom / packn, left, right, pad_value); return 0; @@ -588,8 +596,8 @@ int Padding_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt Mat borderm = top_blob.channel(q); // TODO perchannel - // vint8m1_t pad_value = per_channel_pad_data_size ? vle8_v_i8m1(per_channel_pad_data + q * packn) : vmv_v_x_i8m1((signed char)value); - vint8m1_t pad_value = vmv_v_x_i8m1((signed char)value, vl); + // vint8m1_t pad_value = per_channel_pad_data_size ? __riscv_vle8_v_i8m1(per_channel_pad_data + q * packn) : __riscv_vmv_v_x_i8m1((signed char)value); + vint8m1_t pad_value = __riscv_vmv_v_x_i8m1((signed char)value, vl); //Channel padding if ((q - front_) < 0 || (q - front_) >= channels) @@ -628,8 +636,8 @@ int Padding_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt for (int q = 0; q < channels; q++) { // TODO perchannel - // vint8m1_t pad_value = per_channel_pad_data_size ? vle8_v_i8m1(per_channel_pad_data + q * packn) : vmv_v_x_i8m1((signed char)value); - vint8m1_t pad_value = vmv_v_x_i8m1((signed char)value, vl); + // vint8m1_t pad_value = per_channel_pad_data_size ? 
__riscv_vle8_v_i8m1(per_channel_pad_data + q * packn) : __riscv_vmv_v_x_i8m1((signed char)value); + vint8m1_t pad_value = __riscv_vmv_v_x_i8m1((signed char)value, vl); for (int z = 0; z < outd; z++) { diff --git a/src/layer/riscv/padding_riscv.h b/src/layer/riscv/padding_riscv.h index 7642dccae5f..43de3d2e5e9 100644 --- a/src/layer/riscv/padding_riscv.h +++ b/src/layer/riscv/padding_riscv.h @@ -41,6 +41,7 @@ class Padding_riscv : public Padding #endif // fp16 + unsigned short value_fp16; Mat per_channel_pad_data_fp16; }; diff --git a/src/layer/riscv/pooling_riscv.cpp b/src/layer/riscv/pooling_riscv.cpp index 1b4c1f0ed8a..92f0521bbfd 100644 --- a/src/layer/riscv/pooling_riscv.cpp +++ b/src/layer/riscv/pooling_riscv.cpp @@ -18,9 +18,10 @@ #if __riscv_vector #include +#include "riscv_usability.h" #endif // __riscv_vector -#include "riscv_usability.h" +#include "cpu.h" namespace ncnn { @@ -28,10 +29,14 @@ Pooling_riscv::Pooling_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif } int Pooling_riscv::create_pipeline(const Option& /*opt*/) @@ -55,9 +60,9 @@ int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& return Pooling::forward(bottom_blob, top_blob, opt); } +#if NCNN_ZFH int elembits = bottom_blob.elembits(); -#if __riscv_vector && __riscv_zfh if (opt.use_fp16_storage && elembits == 16) { if (opt.use_fp16_arithmetic) @@ -72,7 +77,7 @@ int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& #if __riscv_vector const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -101,16 +106,16 @@ int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& { const float* ptr = bottom_blob.channel(q); - vfloat32m1_t _max = vle32_v_f32m1(ptr, vl); + vfloat32m1_t _max = __riscv_vle32_v_f32m1(ptr, vl); for (int i = 0; i < size; i++) { - vfloat32m1_t _val = vle32_v_f32m1(ptr, vl); - _max = vfmax_vv_f32m1(_max, _val, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(ptr, vl); + _max = __riscv_vfmax_vv_f32m1(_max, _val, vl); ptr += packn; } float* outptr = top_blob; - vse32_v_f32m1(outptr + q * packn, _max, vl); + __riscv_vse32_v_f32m1(outptr + q * packn, _max, vl); } } else if (pooling_type == PoolMethod_AVE) @@ -120,18 +125,18 @@ int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& { const float* ptr = bottom_blob.channel(q); - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); for (int i = 0; i < size; i++) { - vfloat32m1_t _val = vle32_v_f32m1(ptr, vl); - _sum = vfadd_vv_f32m1(_sum, _val, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(ptr, vl); + _sum = __riscv_vfadd_vv_f32m1(_sum, _val, vl); ptr += packn; } - vfloat32m1_t _avg = vfmul_vf_f32m1(_sum, 1.f / size, vl); + vfloat32m1_t _avg = __riscv_vfmul_vf_f32m1(_sum, 1.f / size, vl); float* outptr = top_blob; - vse32_v_f32m1(outptr + q * packn, _avg, vl); + __riscv_vse32_v_f32m1(outptr + q * packn, _avg, vl); } } @@ -188,15 +193,15 @@ int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& { const float* sptr = m.row(i * stride_h) + j * stride_w * packn; - vfloat32m1_t _max = vle32_v_f32m1(sptr, vl); + vfloat32m1_t _max = __riscv_vle32_v_f32m1(sptr, 
vl); for (int k = 0; k < maxk; k++) { - vfloat32m1_t _val = vle32_v_f32m1(sptr + space_ofs[k] * packn, vl); - _max = vfmax_vv_f32m1(_max, _val, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(sptr + space_ofs[k] * packn, vl); + _max = __riscv_vfmax_vv_f32m1(_max, _val, vl); } - vse32_v_f32m1(outptr + j * packn, _max, vl); + __riscv_vse32_v_f32m1(outptr + j * packn, _max, vl); } outptr += outw * packn; @@ -230,7 +235,7 @@ int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& { int sx0 = j * stride_w; - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); int area = 0; for (int ki = 0; ki < kernel_h; ki++) @@ -253,14 +258,14 @@ int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& if (sx >= w - pad_right - wtailpad) break; - vfloat32m1_t _val = vle32_v_f32m1(m.row(sy) + sx * packn, vl); - _sum = vfadd_vv_f32m1(_sum, _val, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(m.row(sy) + sx * packn, vl); + _sum = __riscv_vfadd_vv_f32m1(_sum, _val, vl); area += 1; } } - vfloat32m1_t _avg = vfmul_vf_f32m1(_sum, 1.f / area, vl); - vse32_v_f32m1(outptr + j * packn, _avg, vl); + vfloat32m1_t _avg = __riscv_vfmul_vf_f32m1(_sum, 1.f / area, vl); + __riscv_vse32_v_f32m1(outptr + j * packn, _avg, vl); } outptr += outw * packn; @@ -283,16 +288,16 @@ int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& { const float* sptr = m.row(i * stride_h) + j * stride_w * packn; - vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); + vfloat32m1_t _sum = __riscv_vfmv_v_f_f32m1(0.f, vl); for (int k = 0; k < maxk; k++) { - vfloat32m1_t _val = vle32_v_f32m1(sptr + space_ofs[k] * packn, vl); - _sum = vfadd_vv_f32m1(_sum, _val, vl); + vfloat32m1_t _val = __riscv_vle32_v_f32m1(sptr + space_ofs[k] * packn, vl); + _sum = __riscv_vfadd_vv_f32m1(_sum, _val, vl); } - vfloat32m1_t _avg = vfmul_vf_f32m1(_sum, inv_maxk, vl); - vse32_v_f32m1(outptr + j * packn, _avg, vl); + vfloat32m1_t _avg = __riscv_vfmul_vf_f32m1(_sum, inv_maxk, vl); + __riscv_vse32_v_f32m1(outptr + j * packn, _avg, vl); } outptr += outw * packn; @@ -308,655 +313,4 @@ int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& return Pooling::forward(bottom_blob, top_blob, opt); } -#if __riscv_vector && __riscv_zfh -int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - // max value in NxN window - // avg value in NxN window - - const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); - - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h); - - if (global_pooling) - { - top_blob.create(channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - int size = w * h; - - if (pooling_type == PoolMethod_MAX) - { - if (elempack == packn) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const __fp16* ptr = bottom_blob.channel(q); - - vfloat16m1_t _max = vfmv_v_f_f16m1((__fp16)-FLT_MAX, vl); - for (int i = 0; i < size; i++) - { - vfloat16m1_t _val = vle16_v_f16m1(ptr, vl); - _max = vfmax_vv_f16m1(_max, _val, vl); - ptr += packn; - } - - __fp16* outptr = top_blob; - vse16_v_f16m1(outptr + q * packn, _max, vl); 
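[Editorial aside, not part of the patch: the packing and padding hunks above show the two main API changes in this migration — every RVV intrinsic gains the __riscv_ prefix, and segment loads/stores move from multiple output pointers to tuple types (vuint16m1x4_t, vuint16m1x8_t) read with __riscv_vget_* and built with __riscv_vcreate_*. The following is a minimal standalone sketch of that tuple pattern, using only intrinsics that already appear in these hunks; the helper name unpack4_u16 is hypothetical and not taken from the patch.]

#include <riscv_vector.h>

// de-interleave a pack-4 u16 row into four planes, RVV v1.0 intrinsics style
static void unpack4_u16(const unsigned short* r0,
                        unsigned short* outptr0, unsigned short* outptr1,
                        unsigned short* outptr2, unsigned short* outptr3,
                        int n)
{
    while (n > 0)
    {
        size_t vl = __riscv_vsetvl_e16m1(n);

        // one segment load returns a 4-field tuple instead of four out-parameters
        vuint16m1x4_t _p = __riscv_vlseg4e16_v_u16m1x4(r0, vl);

        // extract each tuple field and store it contiguously
        __riscv_vse16_v_u16m1(outptr0, __riscv_vget_v_u16m1x4_u16m1(_p, 0), vl);
        __riscv_vse16_v_u16m1(outptr1, __riscv_vget_v_u16m1x4_u16m1(_p, 1), vl);
        __riscv_vse16_v_u16m1(outptr2, __riscv_vget_v_u16m1x4_u16m1(_p, 2), vl);
        __riscv_vse16_v_u16m1(outptr3, __riscv_vget_v_u16m1x4_u16m1(_p, 3), vl);

        r0 += vl * 4;
        outptr0 += vl;
        outptr1 += vl;
        outptr2 += vl;
        outptr3 += vl;
        n -= vl;
    }
}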
- } - } - - if (elempack == 1) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const __fp16* ptr = bottom_blob.channel(q); - - __fp16 max = (__fp16)-FLT_MAX; - for (int i = 0; i < size; i++) - { - max = std::max(max, ptr[i]); - } - - __fp16* outptr = top_blob; - outptr[q] = max; - } - } - } - - if (pooling_type == PoolMethod_AVE) - { - if (elempack == packn) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const __fp16* ptr = bottom_blob.channel(q); - - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); - for (int i = 0; i < size; i++) - { - vfloat32m2_t _val = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr, vl), vl); - _sum = vfadd_vv_f32m2(_sum, _val, vl); - ptr += packn; - } - - vfloat32m2_t _avg = vfmul_vf_f32m2(_sum, 1.f / size, vl); - - __fp16* outptr = top_blob; - vse16_v_f16m1(outptr + q * packn, vfncvt_f_f_w_f16m1(_avg, vl), vl); - } - } - - if (elempack == 1) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const __fp16* ptr = bottom_blob.channel(q); - - float sum = 0.f; - for (int i = 0; i < size; i++) - { - sum += (float)ptr[i]; - } - - __fp16* outptr = top_blob; - outptr[q] = (__fp16)(sum / size); - } - } - } - - return 0; - } - - Mat bottom_blob_bordered; - make_padding(bottom_blob, bottom_blob_bordered, opt); - if (bottom_blob_bordered.empty()) - return -100; - - w = bottom_blob_bordered.w; - h = bottom_blob_bordered.h; - - int outw = (w - kernel_w) / stride_w + 1; - int outh = (h - kernel_h) / stride_h + 1; - - top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - const int maxk = kernel_w * kernel_h; - - // kernel offsets - std::vector _space_ofs(maxk); - int* space_ofs = &_space_ofs[0]; - { - int p1 = 0; - int p2 = 0; - int gap = w - kernel_w; - for (int i = 0; i < kernel_h; i++) - { - for (int j = 0; j < kernel_w; j++) - { - space_ofs[p1] = p2; - p1++; - p2++; - } - p2 += gap; - } - } - - if (pooling_type == PoolMethod_MAX) - { - if (elempack == packn) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat m = bottom_blob_bordered.channel(q); - __fp16* outptr = top_blob.channel(q); - - for (int i = 0; i < outh; i++) - { - for (int j = 0; j < outw; j++) - { - const __fp16* sptr = m.row(i * stride_h) + j * stride_w * packn; - - vfloat16m1_t _max = vfmv_v_f_f16m1((__fp16)-FLT_MAX, vl); - - for (int k = 0; k < maxk; k++) - { - vfloat16m1_t _val = vle16_v_f16m1(sptr + space_ofs[k] * packn, vl); - _max = vfmax_vv_f16m1(_max, _val, vl); - } - - vse16_v_f16m1(outptr + j * packn, _max, vl); - } - - outptr += outw * packn; - } - } - } - - if (elempack == 1) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat m = bottom_blob_bordered.channel(q); - __fp16* outptr = top_blob.channel(q); - - for (int i = 0; i < outh; i++) - { - for (int j = 0; j < outw; j++) - { - const __fp16* sptr = m.row(i * stride_h) + j * stride_w; - - __fp16 max = (__fp16)-FLT_MAX; - - for (int k = 0; k < maxk; k++) - { - __fp16 val = sptr[space_ofs[k]]; - max = std::max(max, val); - } - - outptr[j] = max; - } - - outptr += outw; - } - } - } - } - - if (pooling_type == PoolMethod_AVE) - { - if (avgpool_count_include_pad == 0) - { - int wtailpad = 0; - int htailpad = 0; - - if (pad_mode == 0) // full padding - { - wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - 
pad_right; - htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom; - } - - if (elempack == packn) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat m = bottom_blob_bordered.channel(q); - __fp16* outptr = top_blob.channel(q); - - for (int i = 0; i < outh; i++) - { - int sy0 = i * stride_h; - - for (int j = 0; j < outw; j++) - { - int sx0 = j * stride_w; - - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); - int area = 0; - - for (int ki = 0; ki < kernel_h; ki++) - { - int sy = sy0 + ki; - - if (sy < pad_top) - continue; - - if (sy >= h - pad_bottom - htailpad) - break; - - for (int kj = 0; kj < kernel_w; kj++) - { - int sx = sx0 + kj; - - if (sx < pad_left) - continue; - - if (sx >= w - pad_right - wtailpad) - break; - - vfloat32m2_t _val = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(m.row(sy) + sx * packn, vl), vl); - _sum = vfadd_vv_f32m2(_sum, _val, vl); - area += 1; - } - } - - vfloat32m2_t _avg = vfmul_vf_f32m2(_sum, 1.f / area, vl); - vse16_v_f16m1(outptr + j * packn, vfncvt_f_f_w_f16m1(_avg, vl), vl); - } - - outptr += outw * packn; - } - } - } - - if (elempack == 1) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat m = bottom_blob_bordered.channel(q); - __fp16* outptr = top_blob.channel(q); - - for (int i = 0; i < outh; i++) - { - int sy0 = i * stride_h; - - for (int j = 0; j < outw; j++) - { - int sx0 = j * stride_w; - - float sum = 0.f; - int area = 0; - - for (int ki = 0; ki < kernel_h; ki++) - { - int sy = sy0 + ki; - - if (sy < pad_top) - continue; - - if (sy >= h - pad_bottom - htailpad) - break; - - for (int kj = 0; kj < kernel_w; kj++) - { - int sx = sx0 + kj; - - if (sx < pad_left) - continue; - - if (sx >= w - pad_right - wtailpad) - break; - - float val = (float)(m.row(sy)[sx]); - sum += val; - area += 1; - } - } - - outptr[j] = (__fp16)(sum / area); - } - - outptr += outw; - } - } - } - } - - if (avgpool_count_include_pad == 1) - { - if (elempack == packn) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat m = bottom_blob_bordered.channel(q); - __fp16* outptr = top_blob.channel(q); - - const float inv_maxk = 1.f / maxk; - - for (int i = 0; i < outh; i++) - { - for (int j = 0; j < outw; j++) - { - const __fp16* sptr = m.row(i * stride_h) + j * stride_w * packn; - - vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); - - for (int k = 0; k < maxk; k++) - { - vfloat32m2_t _val = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(sptr + space_ofs[k] * packn, vl), vl); - _sum = vfadd_vv_f32m2(_sum, _val, vl); - } - - vfloat32m2_t _avg = vfmul_vf_f32m2(_sum, inv_maxk, vl); - vse16_v_f16m1(outptr + j * packn, vfncvt_f_f_w_f16m1(_avg, vl), vl); - } - - outptr += outw * packn; - } - } - } - - if (elempack == 1) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat m = bottom_blob_bordered.channel(q); - __fp16* outptr = top_blob.channel(q); - - for (int i = 0; i < outh; i++) - { - for (int j = 0; j < outw; j++) - { - const __fp16* sptr = m.row(i * stride_h) + j * stride_w; - - float sum = 0.f; - - for (int k = 0; k < maxk; k++) - { - float val = (float)(sptr[space_ofs[k]]); - sum += val; - } - - outptr[j] = (__fp16)(sum / maxk); - } - - outptr += outw; - } - } - } - } - } - - return 0; -} - -int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ - // max value in NxN window - // avg value in NxN window - - if 
(pooling_type == PoolMethod_MAX || global_pooling) - { - return forward_fp16s(bottom_blob, top_blob, opt); - } - - const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); - - int w = bottom_blob.w; - int h = bottom_blob.h; - int channels = bottom_blob.c; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h); - - Mat bottom_blob_bordered; - make_padding(bottom_blob, bottom_blob_bordered, opt); - if (bottom_blob_bordered.empty()) - return -100; - - w = bottom_blob_bordered.w; - h = bottom_blob_bordered.h; - - int outw = (w - kernel_w) / stride_w + 1; - int outh = (h - kernel_h) / stride_h + 1; - - top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - const int maxk = kernel_w * kernel_h; - - // kernel offsets - std::vector _space_ofs(maxk); - int* space_ofs = &_space_ofs[0]; - { - int p1 = 0; - int p2 = 0; - int gap = w - kernel_w; - for (int i = 0; i < kernel_h; i++) - { - for (int j = 0; j < kernel_w; j++) - { - space_ofs[p1] = p2; - p1++; - p2++; - } - p2 += gap; - } - } - - if (pooling_type == PoolMethod_AVE) - { - if (avgpool_count_include_pad == 0) - { - int wtailpad = 0; - int htailpad = 0; - - if (pad_mode == 0) // full padding - { - wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right; - htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom; - } - - if (elempack == packn) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat m = bottom_blob_bordered.channel(q); - __fp16* outptr = top_blob.channel(q); - - for (int i = 0; i < outh; i++) - { - int sy0 = i * stride_h; - - for (int j = 0; j < outw; j++) - { - int sx0 = j * stride_w; - - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); - int area = 0; - - for (int ki = 0; ki < kernel_h; ki++) - { - int sy = sy0 + ki; - - if (sy < pad_top) - continue; - - if (sy >= h - pad_bottom - htailpad) - break; - - for (int kj = 0; kj < kernel_w; kj++) - { - int sx = sx0 + kj; - - if (sx < pad_left) - continue; - - if (sx >= w - pad_right - wtailpad) - break; - - vfloat16m1_t _val = vle16_v_f16m1(m.row(sy) + sx * packn, vl); - _sum = vfadd_vv_f16m1(_sum, _val, vl); - area += 1; - } - } - - vfloat16m1_t _avg = vfmul_vf_f16m1(_sum, (__fp16)(1.f / area), vl); - vse16_v_f16m1(outptr + j * packn, _avg, vl); - } - - outptr += outw * packn; - } - } - } - - if (elempack == 1) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat m = bottom_blob_bordered.channel(q); - __fp16* outptr = top_blob.channel(q); - - for (int i = 0; i < outh; i++) - { - int sy0 = i * stride_h; - - for (int j = 0; j < outw; j++) - { - int sx0 = j * stride_w; - - __fp16 sum = (__fp16)0.f; - int area = 0; - - for (int ki = 0; ki < kernel_h; ki++) - { - int sy = sy0 + ki; - - if (sy < pad_top) - continue; - - if (sy >= h - pad_bottom - htailpad) - break; - - for (int kj = 0; kj < kernel_w; kj++) - { - int sx = sx0 + kj; - - if (sx < pad_left) - continue; - - if (sx >= w - pad_right - wtailpad) - break; - - __fp16 val = m.row(sy)[sx]; - sum += val; - area += 1; - } - } - - outptr[j] = sum / area; - } - - outptr += outw; - } - } - } - } - - if (avgpool_count_include_pad == 1) - { - if (elempack == packn) - { - #pragma omp parallel for 
num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat m = bottom_blob_bordered.channel(q); - __fp16* outptr = top_blob.channel(q); - - const __fp16 inv_maxk = (__fp16)(1.f / maxk); - - for (int i = 0; i < outh; i++) - { - for (int j = 0; j < outw; j++) - { - const __fp16* sptr = m.row(i * stride_h) + j * stride_w * packn; - - vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); - - for (int k = 0; k < maxk; k++) - { - vfloat16m1_t _val = vle16_v_f16m1(sptr + space_ofs[k] * packn, vl); - _sum = vfadd_vv_f16m1(_sum, _val, vl); - } - - vfloat16m1_t _avg = vfmul_vf_f16m1(_sum, inv_maxk, vl); - vse16_v_f16m1(outptr + j * packn, _avg, vl); - } - - outptr += outw * packn; - } - } - } - - if (elempack == 1) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - const Mat m = bottom_blob_bordered.channel(q); - __fp16* outptr = top_blob.channel(q); - - for (int i = 0; i < outh; i++) - { - for (int j = 0; j < outw; j++) - { - const __fp16* sptr = m.row(i * stride_h) + j * stride_w; - - __fp16 sum = (__fp16)0.f; - - for (int k = 0; k < maxk; k++) - { - __fp16 val = sptr[space_ofs[k]]; - sum += val; - } - - outptr[j] = sum / maxk; - } - - outptr += outw; - } - } - } - } - } - - return 0; -} -#endif // __riscv_vector && __riscv_zfh - } // namespace ncnn diff --git a/src/layer/riscv/pooling_riscv.h b/src/layer/riscv/pooling_riscv.h index e285b58eb19..8f14df8fabb 100644 --- a/src/layer/riscv/pooling_riscv.h +++ b/src/layer/riscv/pooling_riscv.h @@ -28,7 +28,7 @@ class Pooling_riscv : public Pooling virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/pooling_riscv_zfh.cpp b/src/layer/riscv/pooling_riscv_zfh.cpp new file mode 100644 index 00000000000..8da0afbb737 --- /dev/null +++ b/src/layer/riscv/pooling_riscv_zfh.cpp @@ -0,0 +1,695 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
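[Editorial aside, not part of the patch: the fp16 pooling kernels move out of pooling_riscv.cpp into this new pooling_riscv_zfh.cpp, where each vector path is wrapped in #if __riscv_zvfh while the scalar elempack == 1 paths stay unguarded, so the file also builds for targets without vector fp16. Which path actually runs is decided at run time in the layer constructor. Below is a minimal sketch of that dispatch pattern, assuming the ncnn::cpu_support_riscv_* helpers referenced in this patch; ExampleLayer is a hypothetical stand-in for Pooling_riscv / PReLU_riscv.]

#include "cpu.h"

struct ExampleLayer // hypothetical stand-in, for illustration only
{
    bool support_packing = false;
    bool support_fp16_storage = false;

    ExampleLayer()
    {
#if __riscv_vector
        support_packing = true;
#endif
#if NCNN_ZFH
#if __riscv_vector
        // vector fp16 kernels need the Zvfh extension, probed at run time
        support_fp16_storage = ncnn::cpu_support_riscv_zvfh();
#else
        // scalar-only fp16 kernels only need plain Zfh
        support_fp16_storage = ncnn::cpu_support_riscv_zfh();
#endif
#endif
    }
};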
+ +#include "pooling_riscv.h" + +#include + +#if __riscv_vector +#include +#include "riscv_usability.h" +#endif // __riscv_vector + +namespace ncnn { + +#if NCNN_ZFH +int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + // max value in NxN window + // avg value in NxN window + +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; + const size_t vl = __riscv_vsetvl_e16m1(packn); +#endif // __riscv_zvfh + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h); + + if (global_pooling) + { + top_blob.create(channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int size = w * h; + + if (pooling_type == PoolMethod_MAX) + { +#if __riscv_zvfh + if (elempack == packn) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_blob.channel(q); + + vfloat16m1_t _max = __riscv_vfmv_v_f_f16m1((__fp16)-FLT_MAX, vl); + for (int i = 0; i < size; i++) + { + vfloat16m1_t _val = __riscv_vle16_v_f16m1(ptr, vl); + _max = __riscv_vfmax_vv_f16m1(_max, _val, vl); + ptr += packn; + } + + __fp16* outptr = top_blob; + __riscv_vse16_v_f16m1(outptr + q * packn, _max, vl); + } + } +#endif // __riscv_zvfh + + if (elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_blob.channel(q); + + __fp16 max = (__fp16)-FLT_MAX; + for (int i = 0; i < size; i++) + { + max = std::max(max, ptr[i]); + } + + __fp16* outptr = top_blob; + outptr[q] = max; + } + } + } + + if (pooling_type == PoolMethod_AVE) + { +#if __riscv_zvfh + if (elempack == packn) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_blob.channel(q); + + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); + for (int i = 0; i < size; i++) + { + vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(ptr, vl), vl); + _sum = __riscv_vfadd_vv_f32m2(_sum, _val, vl); + ptr += packn; + } + + vfloat32m2_t _avg = __riscv_vfmul_vf_f32m2(_sum, 1.f / size, vl); + + __fp16* outptr = top_blob; + __riscv_vse16_v_f16m1(outptr + q * packn, __riscv_vfncvt_f_f_w_f16m1(_avg, vl), vl); + } + } +#endif // __riscv_zvfh + + if (elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_blob.channel(q); + + float sum = 0.f; + for (int i = 0; i < size; i++) + { + sum += (float)ptr[i]; + } + + __fp16* outptr = top_blob; + outptr[q] = (__fp16)(sum / size); + } + } + } + + return 0; + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int outw = (w - kernel_w) / stride_w + 1; + int outh = (h - kernel_h) / stride_h + 1; + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w - kernel_w; + for (int i = 0; i < 
kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2++; + } + p2 += gap; + } + } + + if (pooling_type == PoolMethod_MAX) + { +#if __riscv_zvfh + if (elempack == packn) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + __fp16* outptr = top_blob.channel(q); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + const __fp16* sptr = m.row(i * stride_h) + j * stride_w * packn; + + vfloat16m1_t _max = __riscv_vfmv_v_f_f16m1((__fp16)-FLT_MAX, vl); + + for (int k = 0; k < maxk; k++) + { + vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr + space_ofs[k] * packn, vl); + _max = __riscv_vfmax_vv_f16m1(_max, _val, vl); + } + + __riscv_vse16_v_f16m1(outptr + j * packn, _max, vl); + } + + outptr += outw * packn; + } + } + } +#endif // __riscv_zvfh + + if (elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + __fp16* outptr = top_blob.channel(q); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + const __fp16* sptr = m.row(i * stride_h) + j * stride_w; + + __fp16 max = (__fp16)-FLT_MAX; + + for (int k = 0; k < maxk; k++) + { + __fp16 val = sptr[space_ofs[k]]; + max = std::max(max, val); + } + + outptr[j] = max; + } + + outptr += outw; + } + } + } + } + + if (pooling_type == PoolMethod_AVE) + { + if (avgpool_count_include_pad == 0) + { + int wtailpad = 0; + int htailpad = 0; + + if (pad_mode == 0) // full padding + { + wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right; + htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom; + } + +#if __riscv_zvfh + if (elempack == packn) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + __fp16* outptr = top_blob.channel(q); + + for (int i = 0; i < outh; i++) + { + int sy0 = i * stride_h; + + for (int j = 0; j < outw; j++) + { + int sx0 = j * stride_w; + + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); + int area = 0; + + for (int ki = 0; ki < kernel_h; ki++) + { + int sy = sy0 + ki; + + if (sy < pad_top) + continue; + + if (sy >= h - pad_bottom - htailpad) + break; + + for (int kj = 0; kj < kernel_w; kj++) + { + int sx = sx0 + kj; + + if (sx < pad_left) + continue; + + if (sx >= w - pad_right - wtailpad) + break; + + vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(m.row(sy) + sx * packn, vl), vl); + _sum = __riscv_vfadd_vv_f32m2(_sum, _val, vl); + area += 1; + } + } + + vfloat32m2_t _avg = __riscv_vfmul_vf_f32m2(_sum, 1.f / area, vl); + __riscv_vse16_v_f16m1(outptr + j * packn, __riscv_vfncvt_f_f_w_f16m1(_avg, vl), vl); + } + + outptr += outw * packn; + } + } + } +#endif // __riscv_zvfh + + if (elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + __fp16* outptr = top_blob.channel(q); + + for (int i = 0; i < outh; i++) + { + int sy0 = i * stride_h; + + for (int j = 0; j < outw; j++) + { + int sx0 = j * stride_w; + + float sum = 0.f; + int area = 0; + + for (int ki = 0; ki < kernel_h; ki++) + { + int sy = sy0 + ki; + + if (sy < pad_top) + continue; + + if (sy >= h - pad_bottom - htailpad) + break; + + for (int kj = 0; kj < kernel_w; kj++) + { + int sx = sx0 + kj; + + if (sx < pad_left) + 
continue; + + if (sx >= w - pad_right - wtailpad) + break; + + float val = (float)(m.row(sy)[sx]); + sum += val; + area += 1; + } + } + + outptr[j] = (__fp16)(sum / area); + } + + outptr += outw; + } + } + } + } + + if (avgpool_count_include_pad == 1) + { +#if __riscv_zvfh + if (elempack == packn) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + __fp16* outptr = top_blob.channel(q); + + const float inv_maxk = 1.f / maxk; + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + const __fp16* sptr = m.row(i * stride_h) + j * stride_w * packn; + + vfloat32m2_t _sum = __riscv_vfmv_v_f_f32m2(0.f, vl); + + for (int k = 0; k < maxk; k++) + { + vfloat32m2_t _val = __riscv_vfwcvt_f_f_v_f32m2(__riscv_vle16_v_f16m1(sptr + space_ofs[k] * packn, vl), vl); + _sum = __riscv_vfadd_vv_f32m2(_sum, _val, vl); + } + + vfloat32m2_t _avg = __riscv_vfmul_vf_f32m2(_sum, inv_maxk, vl); + __riscv_vse16_v_f16m1(outptr + j * packn, __riscv_vfncvt_f_f_w_f16m1(_avg, vl), vl); + } + + outptr += outw * packn; + } + } + } +#endif // __riscv_zvfh + + if (elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + __fp16* outptr = top_blob.channel(q); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + const __fp16* sptr = m.row(i * stride_h) + j * stride_w; + + float sum = 0.f; + + for (int k = 0; k < maxk; k++) + { + float val = (float)(sptr[space_ofs[k]]); + sum += val; + } + + outptr[j] = (__fp16)(sum / maxk); + } + + outptr += outw; + } + } + } + } + } + + return 0; +} + +int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + // max value in NxN window + // avg value in NxN window + + if (pooling_type == PoolMethod_MAX || global_pooling) + { + return forward_fp16s(bottom_blob, top_blob, opt); + } + +#if __riscv_zvfh + const int packn = csrr_vlenb() / 2; + const size_t vl = __riscv_vsetvl_e16m1(packn); +#endif // __riscv_zvfh + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h); + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int outw = (w - kernel_w) / stride_w + 1; + int outh = (h - kernel_h) / stride_h + 1; + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w - kernel_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2++; + } + p2 += gap; + } + } + + if (pooling_type == PoolMethod_AVE) + { + if (avgpool_count_include_pad == 0) + { + int wtailpad = 0; + int htailpad = 0; + + if (pad_mode == 0) // full padding + { + wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right; + htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom; + } + +#if 
__riscv_zvfh + if (elempack == packn) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + __fp16* outptr = top_blob.channel(q); + + for (int i = 0; i < outh; i++) + { + int sy0 = i * stride_h; + + for (int j = 0; j < outw; j++) + { + int sx0 = j * stride_w; + + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + int area = 0; + + for (int ki = 0; ki < kernel_h; ki++) + { + int sy = sy0 + ki; + + if (sy < pad_top) + continue; + + if (sy >= h - pad_bottom - htailpad) + break; + + for (int kj = 0; kj < kernel_w; kj++) + { + int sx = sx0 + kj; + + if (sx < pad_left) + continue; + + if (sx >= w - pad_right - wtailpad) + break; + + vfloat16m1_t _val = __riscv_vle16_v_f16m1(m.row(sy) + sx * packn, vl); + _sum = __riscv_vfadd_vv_f16m1(_sum, _val, vl); + area += 1; + } + } + + vfloat16m1_t _avg = __riscv_vfmul_vf_f16m1(_sum, (__fp16)(1.f / area), vl); + __riscv_vse16_v_f16m1(outptr + j * packn, _avg, vl); + } + + outptr += outw * packn; + } + } + } +#endif // __riscv_zvfh + + if (elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + __fp16* outptr = top_blob.channel(q); + + for (int i = 0; i < outh; i++) + { + int sy0 = i * stride_h; + + for (int j = 0; j < outw; j++) + { + int sx0 = j * stride_w; + + __fp16 sum = (__fp16)0.f; + int area = 0; + + for (int ki = 0; ki < kernel_h; ki++) + { + int sy = sy0 + ki; + + if (sy < pad_top) + continue; + + if (sy >= h - pad_bottom - htailpad) + break; + + for (int kj = 0; kj < kernel_w; kj++) + { + int sx = sx0 + kj; + + if (sx < pad_left) + continue; + + if (sx >= w - pad_right - wtailpad) + break; + + __fp16 val = m.row(sy)[sx]; + sum += val; + area += 1; + } + } + + outptr[j] = sum / area; + } + + outptr += outw; + } + } + } + } + + if (avgpool_count_include_pad == 1) + { +#if __riscv_zvfh + if (elempack == packn) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + __fp16* outptr = top_blob.channel(q); + + const __fp16 inv_maxk = (__fp16)(1.f / maxk); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + const __fp16* sptr = m.row(i * stride_h) + j * stride_w * packn; + + vfloat16m1_t _sum = __riscv_vfmv_v_f_f16m1((__fp16)0.f, vl); + + for (int k = 0; k < maxk; k++) + { + vfloat16m1_t _val = __riscv_vle16_v_f16m1(sptr + space_ofs[k] * packn, vl); + _sum = __riscv_vfadd_vv_f16m1(_sum, _val, vl); + } + + vfloat16m1_t _avg = __riscv_vfmul_vf_f16m1(_sum, inv_maxk, vl); + __riscv_vse16_v_f16m1(outptr + j * packn, _avg, vl); + } + + outptr += outw * packn; + } + } + } +#endif // __riscv_zvfh + + if (elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + __fp16* outptr = top_blob.channel(q); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + const __fp16* sptr = m.row(i * stride_h) + j * stride_w; + + __fp16 sum = (__fp16)0.f; + + for (int k = 0; k < maxk; k++) + { + __fp16 val = sptr[space_ofs[k]]; + sum += val; + } + + outptr[j] = sum / maxk; + } + + outptr += outw; + } + } + } + } + } + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/prelu_riscv.cpp b/src/layer/riscv/prelu_riscv.cpp index 32cb77023b4..dcf9b84c623 100644 --- 
a/src/layer/riscv/prelu_riscv.cpp +++ b/src/layer/riscv/prelu_riscv.cpp @@ -18,23 +18,29 @@ #include #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { PReLU_riscv::PReLU_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; +#endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); #endif #endif } int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); -#if __riscv_vector && __riscv_zfh if (opt.use_fp16_storage && elembits == 16) { if (opt.use_fp16_arithmetic) @@ -44,56 +50,68 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const } #endif - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int channels = bottom_top_blob.c; - int size = w * h; int elempack = bottom_top_blob.elempack; int dims = bottom_top_blob.dims; -#if __riscv_vector + if (dims == 1) { int w = bottom_top_blob.w; float* ptr = bottom_top_blob; - const float* ptr_slope = slope_data; if (num_slope > 1) { - int n = w * elempack; +#if __riscv_vector + const float* ptr_slope = slope_data; - // #pragma omp parallel for num_threads(opt.num_threads) + int n = w * elempack; while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); - vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vfloat32m8_t _slope = __riscv_vle32_v_f32m8(ptr_slope, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, .0f, vl); - _p = vfmul_vv_f32m8_m(_lower, _p, /*op1*/ _p, _slope, vl); - vse32_v_f32m8(ptr, _p, vl); + _p = __riscv_vfmul_vv_f32m8_mu(_lower, _p, _p, _slope, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; ptr_slope += vl; n -= vl; } +#else // __riscv_vector + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + if (ptr[i] < 0) + ptr[i] *= slope_data[i]; + } +#endif // __riscv_vector } else { float slope = slope_data[0]; +#if __riscv_vector int n = w * elempack; - // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, .0f, vl); - _p = vfmul_vf_f32m8_m(_lower, _p, /*op1*/ _p, slope, vl); - vse32_v_f32m8(ptr, _p, vl); + _p = __riscv_vfmul_vf_f32m8_mu(_lower, _p, _p, slope, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; n -= vl; } +#else // __riscv_vector + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + if (ptr[i] < 0) + ptr[i] *= slope; + } +#endif // __riscv_vector } } @@ -106,6 +124,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const for (int i = 0; i < h; i++) { float* ptr = bottom_top_blob.row(i); +#if __riscv_vector if (num_slope > 1) { for (int j = 0; j < w; j++) @@ -115,13 +134,13 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = 
__riscv_vle32_v_f32m8(ptr, vl); + vfloat32m8_t _slope = __riscv_vle32_v_f32m8(ptr_slope, vl); - vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); - _p = vfmul_vv_f32m8_m(_lower, _p, /*op1*/ _p, _slope, vl); - vse32_v_f32m8(ptr, _p, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, .0f, vl); + _p = __riscv_vfmul_vv_f32m8_mu(_lower, _p, _p, _slope, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; ptr_slope += vl; @@ -135,114 +154,18 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = w * elempack; while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); - - _p = vfmul_vf_f32m8_m(_lower, _p, /*op1*/ _p, slope, vl); - vse32_v_f32m8(ptr, _p, vl); - - ptr += vl; - n -= vl; - } - } - } - } - - if (dims == 3) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int channels = bottom_top_blob.c; - int size = w * h; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - float* ptr = bottom_top_blob.channel(q); - int n = size * elempack; - - if (num_slope > 1 && elempack != 1) - { - while (n > 0) - { - int n1 = elempack; - const float* slope_ptr = (const float*)slope_data + q * elempack; - while (n1 > 0) - { - size_t vl = vsetvl_e32m8(n1); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - vfloat32m8_t _slope = vle32_v_f32m8(slope_ptr, vl); - - vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); - _p = vfmul_vv_f32m8_m(_lower, _p, /*op1*/ _p, _slope, vl); - vse32_v_f32m8(ptr, _p, vl); - - ptr += vl; - slope_ptr += vl; - n1 -= vl; - } - n -= elempack; - } - } - else - { - // num_slope == 1 or elempack ==1 - float slope = num_slope > 1 ? slope_data[q] : slope_data[0]; - while (n > 0) - { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, .0f, vl); - vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); - _p = vfmul_vf_f32m8_m(_lower, _p, /*op1*/ _p, slope, vl); - vse32_v_f32m8(ptr, _p, vl); + _p = __riscv_vfmul_vf_f32m8_mu(_lower, _p, _p, slope, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; n -= vl; } } - } - } - -#else - if (dims == 1) - { - int w = bottom_top_blob.w; - - float* ptr = bottom_top_blob; - - if (num_slope > 1) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - if (ptr[i] < 0) - ptr[i] *= slope_data[i]; - } - } - else - { - float slope = slope_data[0]; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - if (ptr[i] < 0) - ptr[i] *= slope; - } - } - } - - if (dims == 2) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - float* ptr = bottom_top_blob.row(i); +#else // __riscv_vector float slope = num_slope > 1 ? slope_data[i] : slope_data[0]; for (int j = 0; j < w; j++) @@ -250,6 +173,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const if (ptr[j] < 0) ptr[j] *= slope; } +#endif // __riscv_vector } } @@ -264,142 +188,8 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const for (int q = 0; q < channels; q++) { float* ptr = bottom_top_blob.channel(q); - float slope = num_slope > 1 ? 
slope_data[q] : slope_data[0]; - - for (int i = 0; i < size; i++) - { - if (ptr[i] < 0) - ptr[i] *= slope; - } - } - } - -#endif - - return 0; -} - -#if __riscv_vector && __riscv_zfh -//fp16s(a) -//hint: slope always store as fp32 - -int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const -{ - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int size = w * h; - int elempack = bottom_top_blob.elempack; - int dims = bottom_top_blob.dims; - - if (dims == 1) - { - int w = bottom_top_blob.w; - __fp16* ptr = bottom_top_blob; - const float* ptr_slope = slope_data; - if (num_slope > 1) - { - int n = w * elempack; - - // #pragma omp parallel for num_threads(opt.num_threads) - while (n > 0) - { - size_t vl = vsetvl_e16m4(n); - - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); - vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); - vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); - _p = vfmul_vv_f32m8_m(_lower, _p, /*op1*/ _p, _slope, vl); - - vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); - ptr += vl; - ptr_slope += vl; - n -= vl; - } - } - else - { - float slope = slope_data[0]; - - int n = w * elempack; - // #pragma omp parallel for num_threads(opt.num_threads) - while (n > 0) - { - size_t vl = vsetvl_e16m4(n); - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); - vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); - - _p = vfmul_vf_f32m8_m(_lower, _p, /*op1*/ _p, slope, vl); - vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); - - ptr += vl; - n -= vl; - } - } - } - - if (dims == 2) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - __fp16* ptr = bottom_top_blob.row<__fp16>(i); - if (num_slope > 1) - { - for (int j = 0; j < w; j++) - { - const float* ptr_slope = (const float*)slope_data + i * elempack; - int n = elempack; - - while (n > 0) - { - size_t vl = vsetvl_e16m4(n); - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); - vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); - - vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); - _p = vfmul_vv_f32m8_m(_lower, _p, /*op1*/ _p, _slope, vl); - vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); - - ptr += vl; - ptr_slope += vl; - n -= vl; - } - } - } - else - { - float slope = slope_data[0]; - int n = w * elempack; - while (n > 0) - { - size_t vl = vsetvl_e16m4(n); - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); - vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); - - _p = vfmul_vf_f32m8_m(_lower, _p, /*op1*/ _p, slope, vl); - vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); - ptr += vl; - n -= vl; - } - } - } - } - - if (dims == 3) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int channels = bottom_top_blob.c; - int size = w * h; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_vector int n = size * elempack; if (num_slope > 1 && elempack != 1) @@ -410,13 +200,13 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const float* slope_ptr = (const float*)slope_data + q * elempack; while (n1 > 0) { - size_t vl = vsetvl_e16m4(n1); - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); - vfloat32m8_t _slope = vle32_v_f32m8(slope_ptr, vl); + size_t vl = __riscv_vsetvl_e32m8(n1); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vfloat32m8_t _slope = 
__riscv_vle32_v_f32m8(slope_ptr, vl); - vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); - _p = vfmul_vv_f32m8_m(_lower, _p, /*op1*/ _p, _slope, vl); - vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, .0f, vl); + _p = __riscv_vfmul_vv_f32m8_mu(_lower, _p, _p, _slope, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; slope_ptr += vl; @@ -431,187 +221,30 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float slope = num_slope > 1 ? slope_data[q] : slope_data[0]; while (n > 0) { - size_t vl = vsetvl_e16m4(n); - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); - - vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); - _p = vfmul_vf_f32m8_m(_lower, _p, /*op1*/ _p, slope, vl); - vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); - - ptr += vl; - n -= vl; - } - } - } - } - - return 0; -} - -int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const -{ - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int size = w * h; - int elempack = bottom_top_blob.elempack; - int dims = bottom_top_blob.dims; - - if (dims == 1) - { - int w = bottom_top_blob.w; - __fp16* ptr = bottom_top_blob; - const float* ptr_slope = slope_data; - if (num_slope > 1) - { - int n = w * elempack; - - // #pragma omp parallel for num_threads(opt.num_threads) - while (n > 0) - { - size_t vl = vsetvl_e16m4(n); - vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); - vfloat16m4_t _slope = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_slope, vl), vl); - vbool4_t _lower = vmflt_vf_f16m4_b4(_p, .0f, vl); - - _p = vfmul_vv_f16m4_m(_lower, _p, /*op1*/ _p, _slope, vl); - vse16_v_f16m4(ptr, _p, vl); - - ptr += vl; - ptr_slope += vl; - n -= vl; - } - } - else - { - __fp16 slope = slope_data[0]; - - int n = w * elempack; - // #pragma omp parallel for num_threads(opt.num_threads) - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - vbool2_t _lower = vmflt_vf_f16m8_b2(_p, .0f, vl); - - _p = vfmul_vf_f16m8_m(_lower, _p, /*op1*/ _p, slope, vl); - vse16_v_f16m8(ptr, _p, vl); - - ptr += vl; - n -= vl; - } - } - } - - if (dims == 2) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) - { - __fp16* ptr = bottom_top_blob.row<__fp16>(i); - if (num_slope > 1) - { - for (int j = 0; j < w; j++) - { - const float* ptr_slope = (const float*)slope_data + i * elempack; - int n = elempack; - - while (n > 0) - { - size_t vl = vsetvl_e16m4(n); - vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); - vfloat16m4_t _slope = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_slope, vl), vl); - - vbool4_t _lower = vmflt_vf_f16m4_b4(_p, .0f, vl); - _p = vfmul_vv_f16m4_m(_lower, _p, /*op1*/ _p, _slope, vl); - vse16_v_f16m4(ptr, _p, vl); - - ptr += vl; - ptr_slope += vl; - n -= vl; - } - } - } - else - { - __fp16 slope = slope_data[0]; - int n = w * elempack; - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - vbool2_t _lower = vmflt_vf_f16m8_b2(_p, .0f, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); - _p = vfmul_vf_f16m8_m(_lower, _p, /*op1*/ _p, slope, vl); - vse16_v_f16m8(ptr, _p, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, .0f, vl); + _p = __riscv_vfmul_vf_f32m8_mu(_lower, _p, _p, slope, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; n -= vl; } } - } - } - - if (dims == 3) - { - int w = bottom_top_blob.w; - int h = 
bottom_top_blob.h; - int channels = bottom_top_blob.c; - int size = w * h; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - int n = size * elempack; - - if (num_slope > 1 && elempack != 1) - { - while (n > 0) - { - int n1 = elempack; - const float* slope_ptr = (const float*)slope_data + q * elempack; - while (n1 > 0) - { - size_t vl = vsetvl_e16m4(n1); - vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); - vfloat16m4_t _slope = vfncvt_f_f_w_f16m4(vle32_v_f32m8(slope_ptr, vl), vl); - - vbool4_t _lower = vmflt_vf_f16m4_b4(_p, .0f, vl); - _p = vfmul_vv_f16m4_m(_lower, _p, /*op1*/ _p, _slope, vl); - vse16_v_f16m4(ptr, _p, vl); +#else // __riscv_vector + float slope = num_slope > 1 ? slope_data[q] : slope_data[0]; - ptr += vl; - slope_ptr += vl; - n1 -= vl; - } - n -= elempack; - } - } - else + for (int i = 0; i < size; i++) { - // num_slope == 1 or elempack ==1 - float slope = num_slope > 1 ? slope_data[q] : slope_data[0]; - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - - vbool2_t _lower = vmflt_vf_f16m8_b2(_p, .0f, vl); - _p = vfmul_vf_f16m8_m(_lower, _p, /*op1*/ _p, (__fp16)slope, vl); - vse16_v_f16m8(ptr, _p, vl); - - ptr += vl; - n -= vl; - } + if (ptr[i] < 0) + ptr[i] *= slope; } +#endif // __riscv_vector } } return 0; } -#endif } // namespace ncnn diff --git a/src/layer/riscv/prelu_riscv.h b/src/layer/riscv/prelu_riscv.h index 70acbc5d250..4f56f8ce1ab 100644 --- a/src/layer/riscv/prelu_riscv.h +++ b/src/layer/riscv/prelu_riscv.h @@ -26,8 +26,8 @@ class PReLU_riscv : public PReLU virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; -#if __riscv_vector && __riscv_zfh protected: +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/prelu_riscv_zfh.cpp b/src/layer/riscv/prelu_riscv_zfh.cpp new file mode 100644 index 00000000000..7dec88df6b4 --- /dev/null +++ b/src/layer/riscv/prelu_riscv_zfh.cpp @@ -0,0 +1,424 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "prelu_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +namespace ncnn { + +#if NCNN_ZFH +//fp16s(a) +//hint: slope always store as fp32 + +int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + int elempack = bottom_top_blob.elempack; + int dims = bottom_top_blob.dims; + + if (dims == 1) + { + int w = bottom_top_blob.w; + __fp16* ptr = bottom_top_blob; + if (num_slope > 1) + { +#if __riscv_zvfh + const float* ptr_slope = slope_data; + + int n = w * elempack; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + + vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); + vfloat32m8_t _slope = __riscv_vle32_v_f32m8(ptr_slope, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, (__fp16)0.f, vl); + _p = __riscv_vfmul_vv_f32m8_mu(_lower, _p, _p, _slope, vl); + + __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); + ptr += vl; + ptr_slope += vl; + n -= vl; + } +#else // __riscv_zvfh + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + if (ptr[i] < (__fp16)0.f) + ptr[i] = (__fp16)((float)ptr[i] * slope_data[i]); + } +#endif // __riscv_zvfh + } + else + { + float slope = slope_data[0]; + +#if __riscv_zvfh + int n = w * elempack; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, (__fp16)0.f, vl); + + _p = __riscv_vfmul_vf_f32m8_mu(_lower, _p, _p, slope, vl); + __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + if (ptr[i] < (__fp16)0.f) + ptr[i] = (__fp16)((float)ptr[i] * slope); + } +#endif // __riscv_zvfh + } + } + + if (dims == 2) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); +#if __riscv_zvfh + if (num_slope > 1) + { + for (int j = 0; j < w; j++) + { + const float* ptr_slope = (const float*)slope_data + i * elempack; + int n = elempack; + + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); + vfloat32m8_t _slope = __riscv_vle32_v_f32m8(ptr_slope, vl); + + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, (__fp16)0.f, vl); + _p = __riscv_vfmul_vv_f32m8_mu(_lower, _p, _p, _slope, vl); + __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + ptr_slope += vl; + n -= vl; + } + } + } + else + { + float slope = slope_data[0]; + int n = w * elempack; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, (__fp16)0.f, vl); + + _p = __riscv_vfmul_vf_f32m8_mu(_lower, _p, _p, slope, vl); + __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + n -= vl; + } + } +#else // __riscv_zvfh + float slope = num_slope > 1 ? 
slope_data[i] : slope_data[0]; + + for (int j = 0; j < w; j++) + { + if (ptr[j] < (__fp16)0.f) + ptr[j] = (__fp16)((float)ptr[j] * slope); + } +#endif // __riscv_zvfh + } + } + + if (dims == 3) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh + int n = size * elempack; + + if (num_slope > 1 && elempack != 1) + { + while (n > 0) + { + int n1 = elempack; + const float* slope_ptr = (const float*)slope_data + q * elempack; + while (n1 > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n1); + vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); + vfloat32m8_t _slope = __riscv_vle32_v_f32m8(slope_ptr, vl); + + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, (__fp16)0.f, vl); + _p = __riscv_vfmul_vv_f32m8_mu(_lower, _p, _p, _slope, vl); + __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + slope_ptr += vl; + n1 -= vl; + } + n -= elempack; + } + } + else + { + // num_slope == 1 or elempack ==1 + float slope = num_slope > 1 ? slope_data[q] : slope_data[0]; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); + + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, (__fp16)0.f, vl); + _p = __riscv_vfmul_vf_f32m8_mu(_lower, _p, _p, slope, vl); + __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + n -= vl; + } + } +#else // __riscv_zvfh + float slope = num_slope > 1 ? slope_data[q] : slope_data[0]; + + for (int i = 0; i < size; i++) + { + if (ptr[i] < (__fp16)0.f) + ptr[i] = (__fp16)((float)ptr[i] * slope); + } +#endif // __riscv_zvfh + } + } + + return 0; +} + +int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const +{ + int elempack = bottom_top_blob.elempack; + int dims = bottom_top_blob.dims; + + if (dims == 1) + { + int w = bottom_top_blob.w; + __fp16* ptr = bottom_top_blob; + if (num_slope > 1) + { +#if __riscv_zvfh + const float* ptr_slope = slope_data; + + int n = w * elempack; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl); + vfloat16m4_t _slope = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(ptr_slope, vl), vl); + vbool4_t _lower = __riscv_vmflt_vf_f16m4_b4(_p, (__fp16)0.f, vl); + + _p = __riscv_vfmul_vv_f16m4_mu(_lower, _p, _p, _slope, vl); + __riscv_vse16_v_f16m4(ptr, _p, vl); + + ptr += vl; + ptr_slope += vl; + n -= vl; + } +#else // __riscv_zvfh + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + if (ptr[i] < (__fp16)0.f) + ptr[i] *= (__fp16)slope_data[i]; + } +#endif // __riscv_zvfh + } + else + { + __fp16 slope = (__fp16)slope_data[0]; + +#if __riscv_zvfh + int n = w * elempack; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + vbool2_t _lower = __riscv_vmflt_vf_f16m8_b2(_p, (__fp16)0.f, vl); + + _p = __riscv_vfmul_vf_f16m8_mu(_lower, _p, _p, slope, vl); + __riscv_vse16_v_f16m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + if (ptr[i] < (__fp16)0.f) + ptr[i] *= slope; + } +#endif // __riscv_zvfh + } + } + + if (dims == 2) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + 
+ #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); +#if __riscv_zvfh + if (num_slope > 1) + { + for (int j = 0; j < w; j++) + { + const float* ptr_slope = (const float*)slope_data + i * elempack; + int n = elempack; + + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl); + vfloat16m4_t _slope = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(ptr_slope, vl), vl); + + vbool4_t _lower = __riscv_vmflt_vf_f16m4_b4(_p, (__fp16)0.f, vl); + _p = __riscv_vfmul_vv_f16m4_mu(_lower, _p, _p, _slope, vl); + __riscv_vse16_v_f16m4(ptr, _p, vl); + + ptr += vl; + ptr_slope += vl; + n -= vl; + } + } + } + else + { + __fp16 slope = (__fp16)slope_data[0]; + int n = w * elempack; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + vbool2_t _lower = __riscv_vmflt_vf_f16m8_b2(_p, (__fp16)0.f, vl); + + _p = __riscv_vfmul_vf_f16m8_mu(_lower, _p, _p, slope, vl); + __riscv_vse16_v_f16m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } + } +#else // __riscv_zvfh + __fp16 slope = num_slope > 1 ? (__fp16)slope_data[i] : (__fp16)slope_data[0]; + + for (int j = 0; j < w; j++) + { + if (ptr[j] < (__fp16)0.f) + ptr[j] *= slope; + } +#endif // __riscv_zvfh + } + } + + if (dims == 3) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); +#if __riscv_zvfh + int n = size * elempack; + + if (num_slope > 1 && elempack != 1) + { + while (n > 0) + { + int n1 = elempack; + const float* slope_ptr = (const float*)slope_data + q * elempack; + while (n1 > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n1); + vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl); + vfloat16m4_t _slope = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(slope_ptr, vl), vl); + + vbool4_t _lower = __riscv_vmflt_vf_f16m4_b4(_p, (__fp16)0.f, vl); + _p = __riscv_vfmul_vv_f16m4_mu(_lower, _p, _p, _slope, vl); + __riscv_vse16_v_f16m4(ptr, _p, vl); + + ptr += vl; + slope_ptr += vl; + n1 -= vl; + } + n -= elempack; + } + } + else + { + // num_slope == 1 or elempack ==1 + float slope = num_slope > 1 ? slope_data[q] : slope_data[0]; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + + vbool2_t _lower = __riscv_vmflt_vf_f16m8_b2(_p, (__fp16)0.f, vl); + _p = __riscv_vfmul_vf_f16m8_mu(_lower, _p, _p, (__fp16)slope, vl); + __riscv_vse16_v_f16m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } + } +#else // __riscv_zvfh + __fp16 slope = num_slope > 1 ? 
(__fp16)slope_data[q] : (__fp16)slope_data[0]; + + for (int i = 0; i < size; i++) + { + if (ptr[i] < (__fp16)0.f) + ptr[i] *= slope; + } +#endif // __riscv_zvfh + } + } + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/relu_riscv.cpp b/src/layer/riscv/relu_riscv.cpp index cf2d4057069..fe4291331c7 100644 --- a/src/layer/riscv/relu_riscv.cpp +++ b/src/layer/riscv/relu_riscv.cpp @@ -18,21 +18,27 @@ #include #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { ReLU_riscv::ReLU_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; +#endif +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); #endif #endif } int ReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) @@ -58,11 +64,11 @@ int ReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - size_t vl = vsetvl_e32m8(n); + size_t vl = __riscv_vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - _p = vfmax_vf_f32m8(_p, 0.f, vl); - vse32_v_f32m8(ptr, _p, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + _p = __riscv_vfmax_vf_f32m8(_p, 0.f, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; n -= vl; @@ -82,11 +88,11 @@ int ReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - size_t vl = vsetvl_e32m8(n); + size_t vl = __riscv_vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - _p = vfmul_vf_f32m8_m(vmflt_vf_f32m8_b4(_p, .0f, vl), _p, _p, slope, vl); //slope: float(float32_t) - vse32_v_f32m8(ptr, _p, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + _p = __riscv_vfmul_vf_f32m8_mu(__riscv_vmflt_vf_f32m8_b4(_p, .0f, vl), _p, _p, slope, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; n -= vl; @@ -105,55 +111,4 @@ int ReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return 0; } -#if __riscv_vector && __riscv_zfh -int ReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const -{ - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int d = bottom_top_blob.d; - int channels = bottom_top_blob.c; - int elempack = bottom_top_blob.elempack; - int size = w * h * d * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - if (slope == 0.f) - { - int n = size; - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - _p = vfmax_vf_f16m8(_p, (__fp16)0.f, vl); - vse16_v_f16m8(ptr, _p, vl); - - ptr += vl; - n -= vl; - } - } - else - { - int n = size; - __fp16 _slope = (__fp16)slope; - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - _p = vfmul_vf_f16m8_m(vmflt_vf_f16m8_b2(_p, .0f, vl), _p, _p, _slope, vl); - vse16_v_f16m8(ptr, _p, vl); - - ptr += vl; - n -= vl; - } - } - } - - return 0; -} - -#endif } // namespace ncnn diff --git a/src/layer/riscv/relu_riscv.h b/src/layer/riscv/relu_riscv.h index 58181b533b8..7fae384abd1 100644 --- a/src/layer/riscv/relu_riscv.h +++ b/src/layer/riscv/relu_riscv.h @@ -27,7 +27,7 @@ class ReLU_riscv : public ReLU virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if 
__riscv_vector && __riscv_zfh +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif }; diff --git a/src/layer/riscv/relu_riscv_zfh.cpp b/src/layer/riscv/relu_riscv_zfh.cpp new file mode 100644 index 00000000000..b4503421755 --- /dev/null +++ b/src/layer/riscv/relu_riscv_zfh.cpp @@ -0,0 +1,92 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "relu_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +namespace ncnn { + +#if NCNN_ZFH +int ReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + if (slope == 0.f) + { +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + _p = __riscv_vfmax_vf_f16m8(_p, (__fp16)0.f, vl); + __riscv_vse16_v_f16m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + if (*ptr < (__fp16)0.f) + *ptr = (__fp16)0.f; + ptr++; + } +#endif // __riscv_zvfh + } + else + { + __fp16 _slope = (__fp16)slope; +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + _p = __riscv_vfmul_vf_f16m8_mu(__riscv_vmflt_vf_f16m8_b2(_p, (__fp16)0.f, vl), _p, _p, _slope, vl); + __riscv_vse16_v_f16m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + if (*ptr < (__fp16)0.f) + *ptr *= _slope; + ptr++; + } +#endif // __riscv_zvfh + } + } + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/riscv_activation.h b/src/layer/riscv/riscv_activation.h index d5f114f3aaa..1d0f7476321 100644 --- a/src/layer/riscv/riscv_activation.h +++ b/src/layer/riscv/riscv_activation.h @@ -20,61 +20,63 @@ #if __riscv_vector #include #include "rvv_mathfun.h" +#if __riscv_zvfh #include "rvv_mathfun_fp16s.h" +#endif -#define _RVV_FLOAT_ACTIVATION_PS(SEW, LMUL, MLEN) \ - static inline vfloat##SEW##m##LMUL##_t activation_ps(vfloat##SEW##m##LMUL##_t _v, int activation_type, const ncnn::Mat& activation_params, size_t vl) \ - { \ - if (activation_type == 1) \ - { \ - _v = vfmax_vf_f##SEW##m##LMUL(_v, 0.f, vl); \ - } \ - else if (activation_type == 2) \ - { \ - vbool##MLEN##_t _lemask = vmfle_vf_f##SEW##m##LMUL##_b##MLEN(_v, 0.f, vl); \ - _v = vfmul_vf_f##SEW##m##LMUL##_m(_lemask, _v, _v, activation_params[0], vl); \ - } \ - else if (activation_type == 3) \ - { \ - _v = vfmax_vf_f##SEW##m##LMUL(_v, 
activation_params[0], vl); \ - _v = vfmin_vf_f##SEW##m##LMUL(_v, activation_params[1], vl); \ - } \ - else if (activation_type == 4) \ - { \ - _v = sigmoid_ps(_v, vl); \ - } \ - else if (activation_type == 5) \ - { \ - _v = vfmul_vv_f##SEW##m##LMUL(_v, tanh_ps(log_ps(vfadd_vf_f##SEW##m##LMUL(exp_ps(_v, vl), 1.f, vl), vl), vl), vl); \ - } \ - else if (activation_type == 6) \ - { \ - const float alpha = activation_params[0]; \ - const float beta = activation_params[1]; \ - const float lower = -beta / alpha; \ - const float upper = (1.f / alpha) + lower; \ - vbool##MLEN##_t _lower = vmflt_vf_f##SEW##m##LMUL##_b##MLEN(_v, lower, vl); \ - vbool##MLEN##_t _higher = vmfgt_vf_f##SEW##m##LMUL##_b##MLEN(_v, upper, vl); \ - vbool##MLEN##_t _apply = vmnor_mm_b##MLEN(_lower, _higher, vl); \ - _v = vfmerge_vfm_f##SEW##m##LMUL(_lower, _v, .0f, vl); \ - \ - vfloat##SEW##m##LMUL##_t _p0 = vfadd_vf_f##SEW##m##LMUL##_m( \ - _apply, _v, /*op1*/ vfmul_vf_f##SEW##m##LMUL##_m(_apply, _v, _v, alpha, vl), beta, \ - vl); \ - _v = vfmul_vv_f##SEW##m##LMUL##_m(_apply, _v, /*op1*/ _v, _p0, vl); \ - } \ - \ - return _v; \ +#define _RVV_FLOAT_ACTIVATION_PS(SEW, LMUL, MLEN, STYPE) \ + static inline vfloat##SEW##m##LMUL##_t activation_ps(vfloat##SEW##m##LMUL##_t _v, int activation_type, const ncnn::Mat& activation_params, size_t vl) \ + { \ + if (activation_type == 1) \ + { \ + _v = __riscv_vfmax_vf_f##SEW##m##LMUL(_v, (STYPE)0.f, vl); \ + } \ + else if (activation_type == 2) \ + { \ + vbool##MLEN##_t _lemask = __riscv_vmfle_vf_f##SEW##m##LMUL##_b##MLEN(_v, (STYPE)0.f, vl); \ + _v = __riscv_vfmul_vf_f##SEW##m##LMUL##_mu(_lemask, _v, _v, (STYPE)activation_params[0], vl); \ + } \ + else if (activation_type == 3) \ + { \ + _v = __riscv_vfmax_vf_f##SEW##m##LMUL(_v, (STYPE)activation_params[0], vl); \ + _v = __riscv_vfmin_vf_f##SEW##m##LMUL(_v, (STYPE)activation_params[1], vl); \ + } \ + else if (activation_type == 4) \ + { \ + _v = sigmoid_ps(_v, vl); \ + } \ + else if (activation_type == 5) \ + { \ + _v = __riscv_vfmul_vv_f##SEW##m##LMUL(_v, tanh_ps(log_ps(__riscv_vfadd_vf_f##SEW##m##LMUL(exp_ps(_v, vl), (STYPE)1.f, vl), vl), vl), vl); \ + } \ + else if (activation_type == 6) \ + { \ + const float alpha = activation_params[0]; \ + const float beta = activation_params[1]; \ + const float lower = -beta / alpha; \ + const float upper = (1.f / alpha) + lower; \ + vbool##MLEN##_t _lower = __riscv_vmflt_vf_f##SEW##m##LMUL##_b##MLEN(_v, (STYPE)lower, vl); \ + vbool##MLEN##_t _higher = __riscv_vmfgt_vf_f##SEW##m##LMUL##_b##MLEN(_v, (STYPE)upper, vl); \ + vbool##MLEN##_t _apply = __riscv_vmnor_mm_b##MLEN(_lower, _higher, vl); \ + _v = __riscv_vfmerge_vfm_f##SEW##m##LMUL(_v, (STYPE).0f, _lower, vl); \ + \ + vfloat##SEW##m##LMUL##_t _p0 = __riscv_vfadd_vf_f##SEW##m##LMUL##_m(_apply, __riscv_vfmul_vf_f##SEW##m##LMUL##_m(_apply, _v, (STYPE)alpha, vl), (STYPE)beta, vl); \ + _v = __riscv_vfmul_vv_f##SEW##m##LMUL##_mu(_apply, _v, _v, _p0, vl); \ + } \ + \ + return _v; \ } -_RVV_FLOAT_ACTIVATION_PS(16, 1, 16) -_RVV_FLOAT_ACTIVATION_PS(16, 2, 8) -_RVV_FLOAT_ACTIVATION_PS(16, 4, 4) -_RVV_FLOAT_ACTIVATION_PS(16, 8, 2) -_RVV_FLOAT_ACTIVATION_PS(32, 1, 32) -_RVV_FLOAT_ACTIVATION_PS(32, 2, 16) -_RVV_FLOAT_ACTIVATION_PS(32, 4, 8) -_RVV_FLOAT_ACTIVATION_PS(32, 8, 4) +#if __riscv_zvfh +_RVV_FLOAT_ACTIVATION_PS(16, 1, 16, __fp16) +_RVV_FLOAT_ACTIVATION_PS(16, 2, 8, __fp16) +_RVV_FLOAT_ACTIVATION_PS(16, 4, 4, __fp16) +_RVV_FLOAT_ACTIVATION_PS(16, 8, 2, __fp16) +#endif +_RVV_FLOAT_ACTIVATION_PS(32, 1, 32, float) +_RVV_FLOAT_ACTIVATION_PS(32, 2, 16, 
float) +_RVV_FLOAT_ACTIVATION_PS(32, 4, 8, float) +_RVV_FLOAT_ACTIVATION_PS(32, 8, 4, float) #endif // __riscv_vector diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index e2824646f87..16896601599 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -50,10 +50,10 @@ static inline int csrr_vlenb() return a; } -static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) +static inline vfloat32m8_t __riscv_vle32_v_f32m8_f32m1(const float* ptr) { const int packn = csrr_vlenb() / 4; - const size_t vl = vsetvl_e32m8(packn * 8); + const size_t vl = __riscv_vsetvl_e32m8(packn * 8); // NOTE vloxei8_v_f32m8 gets illegal instruction on d1 --- nihui @@ -82,15 +82,15 @@ static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) }; const uint32_t* index = packn == 4 ? index_128bit : index_256bit; - vuint32m8_t bindex = vle32_v_u32m8(index, vl); - return vloxei32_v_f32m8(ptr, bindex, vl); + vuint32m8_t bindex = __riscv_vle32_v_u32m8(index, vl); + return __riscv_vloxei32_v_f32m8(ptr, bindex, vl); } -#if __riscv_zfh -static inline vfloat16m8_t vle16_v_f16m8_f16m1(const __fp16* ptr) +#if __riscv_zvfh +static inline vfloat16m8_t __riscv_vle16_v_f16m8_f16m1(const __fp16* ptr) { const int packn = csrr_vlenb() / 2; - const size_t vl = vsetvl_e16m8(packn * 8); + const size_t vl = __riscv_vsetvl_e16m8(packn * 8); // NOTE vloxei8_v_f16m8 gets illegal instruction on d1 --- nihui @@ -119,287 +119,10 @@ static inline vfloat16m8_t vle16_v_f16m8_f16m1(const __fp16* ptr) }; const uint16_t* index = packn == 8 ? index_128bit : index_256bit; - vuint16m8_t bindex = vle16_v_u16m8(index, vl); - return vloxei16_v_f16m8(ptr, bindex, vl); -} -#endif // __riscv_zfh -#endif // __riscv_vector - -#if __riscv_vector && __rvv_tuple - -// f32m1, vsseg.v -static inline void vsseg8e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl) -{ - vfloat32m1x8_t _tmp = vcreate_f32m1x8(v0, v1, v2, v3, v4, v5, v6, v7); - vsseg8e32_v_f32m1x8(base, _tmp, vl); -} - -static inline void vsseg4e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl) -{ - vfloat32m1x4_t _tmp = vcreate_f32m1x4(v0, v1, v2, v3); - vsseg4e32_v_f32m1x4(base, _tmp, vl); -} - -static inline void vsseg2e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl) -{ - vfloat32m1x2_t _tmp = vcreate_f32m1x2(v0, v1); - vsseg2e32_v_f32m1x2(base, _tmp, vl); -} - -// f32m1, vssseg.v, 8/4/2 -static inline void vssseg8e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl) -{ - vfloat32m1x8_t _tmp = vcreate_f32m1x8(v0, v1, v2, v3, v4, v5, v6, v7); - vssseg8e32_v_f32m1x8(base, bstride, _tmp, vl); -} - -static inline void vssseg4e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl) -{ - vfloat32m1x4_t _tmp = vcreate_f32m1x4(v0, v1, v2, v3); - vssseg4e32_v_f32m1x4(base, bstride, _tmp, vl); -} - -static inline void vssseg2e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl) -{ - vfloat32m1x2_t _tmp = vcreate_f32m1x2(v0, v1); - vssseg2e32_v_f32m1x2(base, bstride, _tmp, vl); -} - -// f32m2, vsseg.v, 4/2 -static inline void vsseg4e32_v_f32m2(float32_t* base, vfloat32m2_t v0, 
vfloat32m2_t v1, vfloat32m2_t v2, vfloat32m2_t v3, size_t vl) -{ - vfloat32m2x4_t _tmp = vcreate_f32m2x4(v0, v1, v2, v3); - vsseg4e32_v_f32m2x4(base, _tmp, vl); -} - -static inline void vsseg2e32_v_f32m2(float32_t* base, vfloat32m2_t v0, vfloat32m2_t v1, size_t vl) -{ - vfloat32m2x2_t _tmp = vcreate_f32m2x2(v0, v1); - vsseg2e32_v_f32m2x2(base, _tmp, vl); -} - -// u16m1, vsseg.v, 8/4 -static inline void vsseg8e16_v_u16m1(uint16_t* base, vuint16m1_t v0, vuint16m1_t v1, vuint16m1_t v2, vuint16m1_t v3, vuint16m1_t v4, vuint16m1_t v5, vuint16m1_t v6, vuint16m1_t v7, size_t vl) -{ - vuint16m1x8_t _tmp = vcreate_u16m1x8(v0, v1, v2, v3, v4, v5, v6, v7); - vsseg8e16_v_u16m1x8(base, _tmp, vl); -} - -static inline void vsseg4e16_v_u16m1(uint16_t* base, vuint16m1_t v0, vuint16m1_t v1, vuint16m1_t v2, vuint16m1_t v3, size_t vl) -{ - vuint16m1x4_t _tmp = vcreate_u16m1x4(v0, v1, v2, v3); - vsseg4e16_v_u16m1x4(base, _tmp, vl); -} - -// u16m2, vsseg.v, 4/2 -static inline void vsseg4e16_v_u16m2(uint16_t* base, vuint16m2_t v0, vuint16m2_t v1, vuint16m2_t v2, vuint16m2_t v3, size_t vl) -{ - vuint16m2x4_t _tmp = vcreate_u16m2x4(v0, v1, v2, v3); - vsseg4e16_v_u16m2x4(base, _tmp, vl); -} - -static inline void vsseg2e16_v_u16m2(uint16_t* base, vuint16m2_t v0, vuint16m2_t v1, size_t vl) -{ - vuint16m2x2_t _tmp = vcreate_u16m2x2(v0, v1); - vsseg2e16_v_u16m2x2(base, _tmp, vl); -} - -// f32m1, vlseg.v 8/4/2 -static inline void vlseg8e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, vfloat32m1_t* v4, vfloat32m1_t* v5, vfloat32m1_t* v6, vfloat32m1_t* v7, const float32_t* base, size_t vl) -{ - vfloat32m1x8_t _tmp = vlseg8e32_v_f32m1x8(base, vl); - *v0 = vget_f32m1x8_f32m1(_tmp, 0); - *v1 = vget_f32m1x8_f32m1(_tmp, 1); - *v2 = vget_f32m1x8_f32m1(_tmp, 2); - *v3 = vget_f32m1x8_f32m1(_tmp, 3); - *v4 = vget_f32m1x8_f32m1(_tmp, 4); - *v5 = vget_f32m1x8_f32m1(_tmp, 5); - *v6 = vget_f32m1x8_f32m1(_tmp, 6); - *v7 = vget_f32m1x8_f32m1(_tmp, 7); -} - -static inline void vlseg4e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, const float32_t* base, size_t vl) -{ - vfloat32m1x4_t _tmp = vlseg4e32_v_f32m1x4(base, vl); - *v0 = vget_f32m1x4_f32m1(_tmp, 0); - *v1 = vget_f32m1x4_f32m1(_tmp, 1); - *v2 = vget_f32m1x4_f32m1(_tmp, 2); - *v3 = vget_f32m1x4_f32m1(_tmp, 3); -} - -static inline void vlseg2e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, const float32_t* base, size_t vl) -{ - vfloat32m1x2_t _tmp = vlseg2e32_v_f32m1x2(base, vl); - *v0 = vget_f32m1x2_f32m1(_tmp, 0); - *v1 = vget_f32m1x2_f32m1(_tmp, 1); -} - -// f32m2, vlseg.v, 4 -static inline void vlseg4e32_v_f32m2(vfloat32m2_t* v0, vfloat32m2_t* v1, vfloat32m2_t* v2, vfloat32m2_t* v3, const float32_t* base, size_t vl) -{ - vfloat32m2x4_t _tmp = vlseg4e32_v_f32m2x4(base, vl); - *v0 = vget_f32m2x4_f32m2(_tmp, 0); - *v1 = vget_f32m2x4_f32m2(_tmp, 1); - *v2 = vget_f32m2x4_f32m2(_tmp, 2); - *v3 = vget_f32m2x4_f32m2(_tmp, 3); -} - -// f32m4, vlseg.v, 2 -static inline void vlseg2e32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float32_t* base, size_t vl) -{ - vfloat32m4x2_t _tmp = vlseg2e32_v_f32m4x2(base, vl); - *v0 = vget_f32m4x2_f32m4(_tmp, 0); - *v1 = vget_f32m4x2_f32m4(_tmp, 1); -} - -// f32m4, vloxseg.v -static inline void vloxseg2ei32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float32_t* base, vuint32m4_t bindex, size_t vl) -{ - vfloat32m4x2_t _tmp = vloxseg2ei32_v_f32m4x2(base, bindex, vl); - *v0 = vget_f32m4x2_f32m4(_tmp, 0); - *v1 = vget_f32m4x2_f32m4(_tmp, 1); + vuint16m8_t bindex = 
__riscv_vle16_v_u16m8(index, vl); + return __riscv_vloxei16_v_f16m8(ptr, bindex, vl); } - -// u16m1, vlseg.v 8/4/2 -static inline void vlseg8e16_v_u16m1(vuint16m1_t* v0, vuint16m1_t* v1, vuint16m1_t* v2, vuint16m1_t* v3, vuint16m1_t* v4, vuint16m1_t* v5, vuint16m1_t* v6, vuint16m1_t* v7, const uint16_t* base, size_t vl) -{ - vuint16m1x8_t _tmp = vlseg8e16_v_u16m1x8(base, vl); - *v0 = vget_u16m1x8_u16m1(_tmp, 0); - *v1 = vget_u16m1x8_u16m1(_tmp, 1); - *v2 = vget_u16m1x8_u16m1(_tmp, 2); - *v3 = vget_u16m1x8_u16m1(_tmp, 3); - *v4 = vget_u16m1x8_u16m1(_tmp, 4); - *v5 = vget_u16m1x8_u16m1(_tmp, 5); - *v6 = vget_u16m1x8_u16m1(_tmp, 6); - *v7 = vget_u16m1x8_u16m1(_tmp, 7); -} - -static inline void vlseg4e16_v_u16m1(vuint16m1_t* v0, vuint16m1_t* v1, vuint16m1_t* v2, vuint16m1_t* v3, const uint16_t* base, size_t vl) -{ - vuint16m1x4_t _tmp = vlseg4e16_v_u16m1x4(base, vl); - *v0 = vget_u16m1x4_u16m1(_tmp, 0); - *v1 = vget_u16m1x4_u16m1(_tmp, 1); - *v2 = vget_u16m1x4_u16m1(_tmp, 2); - *v3 = vget_u16m1x4_u16m1(_tmp, 3); -} - -static inline void vlseg2e16_v_u16m1(vuint16m1_t* v0, vuint16m1_t* v1, const uint16_t* base, size_t vl) -{ - vuint16m1x2_t _tmp = vlseg2e16_v_u16m1x2(base, vl); - *v0 = vget_u16m1x2_u16m1(_tmp, 0); - *v1 = vget_u16m1x2_u16m1(_tmp, 1); -} - -// u16m2, vlseg.v, 4 -static inline void vlseg4e16_v_u16m2(vuint16m2_t* v0, vuint16m2_t* v1, vuint16m2_t* v2, vuint16m2_t* v3, const uint16_t* base, size_t vl) -{ - vuint16m2x4_t _tmp = vlseg4e16_v_u16m2x4(base, vl); - *v0 = vget_u16m2x4_u16m2(_tmp, 0); - *v1 = vget_u16m2x4_u16m2(_tmp, 1); - *v2 = vget_u16m2x4_u16m2(_tmp, 2); - *v3 = vget_u16m2x4_u16m2(_tmp, 3); -} - -// u16m4, vlseg.v, 2 -static inline void vlseg2e16_v_u16m4(vuint16m4_t* v0, vuint16m4_t* v1, const uint16_t* base, size_t vl) -{ - vuint16m4x2_t _tmp = vlseg2e16_v_u16m4x2(base, vl); - *v0 = vget_u16m4x2_u16m4(_tmp, 0); - *v1 = vget_u16m4x2_u16m4(_tmp, 1); -} - -#if __riscv_zfh - -// f16m1, vsseg.v, 8/4/2 -static inline void vsseg8e16_v_f16m1(float16_t* base, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, vfloat16m1_t v4, vfloat16m1_t v5, vfloat16m1_t v6, vfloat16m1_t v7, size_t vl) -{ - vfloat16m1x8_t _tmp = vcreate_f16m1x8(v0, v1, v2, v3, v4, v5, v6, v7); - vsseg8e16_v_f16m1x8(base, _tmp, vl); -} - -static inline void vsseg4e16_v_f16m1(float16_t* base, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, size_t vl) -{ - vfloat16m1x4_t _tmp = vcreate_f16m1x4(v0, v1, v2, v3); - vsseg4e16_v_f16m1x4(base, _tmp, vl); -} - -static inline void vsseg2e16_v_f16m1(float16_t* base, vfloat16m1_t v0, vfloat16m1_t v1, size_t vl) -{ - vfloat16m1x2_t _tmp = vcreate_f16m1x2(v0, v1); - vsseg2e16_v_f16m1x2(base, _tmp, vl); -} - -// f16m1, vssseg.v, 8/4/2 -static inline void vssseg8e16_v_f16m1(float16_t* base, ptrdiff_t bstride, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, vfloat16m1_t v4, vfloat16m1_t v5, vfloat16m1_t v6, vfloat16m1_t v7, size_t vl) -{ - vfloat16m1x8_t _tmp = vcreate_f16m1x8(v0, v1, v2, v3, v4, v5, v6, v7); - vssseg8e16_v_f16m1x8(base, bstride, _tmp, vl); -} - -static inline void vssseg4e16_v_f16m1(float16_t* base, ptrdiff_t bstride, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, size_t vl) -{ - vfloat16m1x4_t _tmp = vcreate_f16m1x4(v0, v1, v2, v3); - vssseg4e16_v_f16m1x4(base, bstride, _tmp, vl); -} - -static inline void vssseg2e16_v_f16m1(float16_t* base, ptrdiff_t bstride, vfloat16m1_t v0, vfloat16m1_t v1, size_t vl) -{ - vfloat16m1x2_t _tmp = vcreate_f16m1x2(v0, v1); - vssseg2e16_v_f16m1x2(base, 
bstride, _tmp, vl); -} - -// f16m1, vlseg.v 8/4/2 -static inline void vlseg8e16_v_f16m1(vfloat16m1_t* v0, vfloat16m1_t* v1, vfloat16m1_t* v2, vfloat16m1_t* v3, vfloat16m1_t* v4, vfloat16m1_t* v5, vfloat16m1_t* v6, vfloat16m1_t* v7, const float16_t* base, size_t vl) -{ - vfloat16m1x8_t _tmp = vlseg8e16_v_f16m1x8(base, vl); - *v0 = vget_f16m1x8_f16m1(_tmp, 0); - *v1 = vget_f16m1x8_f16m1(_tmp, 1); - *v2 = vget_f16m1x8_f16m1(_tmp, 2); - *v3 = vget_f16m1x8_f16m1(_tmp, 3); - *v4 = vget_f16m1x8_f16m1(_tmp, 4); - *v5 = vget_f16m1x8_f16m1(_tmp, 5); - *v6 = vget_f16m1x8_f16m1(_tmp, 6); - *v7 = vget_f16m1x8_f16m1(_tmp, 7); -} - -static inline void vlseg4e16_v_f16m1(vfloat16m1_t* v0, vfloat16m1_t* v1, vfloat16m1_t* v2, vfloat16m1_t* v3, const float16_t* base, size_t vl) -{ - vfloat16m1x4_t _tmp = vlseg4e16_v_f16m1x4(base, vl); - *v0 = vget_f16m1x4_f16m1(_tmp, 0); - *v1 = vget_f16m1x4_f16m1(_tmp, 1); - *v2 = vget_f16m1x4_f16m1(_tmp, 2); - *v3 = vget_f16m1x4_f16m1(_tmp, 3); -} - -static inline void vlseg2e16_v_f16m1(vfloat16m1_t* v0, vfloat16m1_t* v1, const float16_t* base, size_t vl) -{ - vfloat16m1x2_t _tmp = vlseg2e16_v_f16m1x2(base, vl); - *v0 = vget_f16m1x2_f16m1(_tmp, 0); - *v1 = vget_f16m1x2_f16m1(_tmp, 1); -} - -// f16m2, vlseg.v, 4 -static inline void vlseg4e16_v_f16m2(vfloat16m2_t* v0, vfloat16m2_t* v1, vfloat16m2_t* v2, vfloat16m2_t* v3, const float16_t* base, size_t vl) -{ - vfloat16m2x4_t _tmp = vlseg4e16_v_f16m2x4(base, vl); - *v0 = vget_f16m2x4_f16m2(_tmp, 0); - *v1 = vget_f16m2x4_f16m2(_tmp, 1); - *v2 = vget_f16m2x4_f16m2(_tmp, 2); - *v3 = vget_f16m2x4_f16m2(_tmp, 3); -} - -// f16m4, vlseg.v, 2 -static inline void vlseg2e16_v_f16m4(vfloat16m4_t* v0, vfloat16m4_t* v1, const float16_t* base, size_t vl) -{ - vfloat16m4x2_t _tmp = vlseg2e16_v_f16m4x2(base, vl); - *v0 = vget_f16m4x2_f16m4(_tmp, 0); - *v1 = vget_f16m4x2_f16m4(_tmp, 1); -} - -#endif // __riscv_zfh -#endif // __riscv_vector - -#ifdef __riscv_vector +#endif // __riscv_zvfh static inline void transpose8x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h, vfloat32m1_t& _r1l, vfloat32m1_t& _r1h, @@ -411,36 +134,39 @@ static inline void transpose8x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h, vfloat32m1_t& _r7l, vfloat32m1_t& _r7h, size_t vl) { float tmp[64]; - vsseg8e32_v_f32m1(&tmp[0], _r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l, vl); - vsseg8e32_v_f32m1(&tmp[32], _r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h, vl); + vfloat32m1x8_t _rl = __riscv_vcreate_v_f32m1x8(_r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l); + vfloat32m1x8_t _rh = __riscv_vcreate_v_f32m1x8(_r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h); + __riscv_vsseg8e32_v_f32m1x8(&tmp[0], _rl, vl); + __riscv_vsseg8e32_v_f32m1x8(&tmp[32], _rh, vl); float* ptr = (float*)tmp; - _r0l = vle32_v_f32m1(ptr + 0 * 4, vl); - _r0h = vle32_v_f32m1(ptr + 1 * 4, vl); - _r1l = vle32_v_f32m1(ptr + 2 * 4, vl); - _r1h = vle32_v_f32m1(ptr + 3 * 4, vl); - _r2l = vle32_v_f32m1(ptr + 4 * 4, vl); - _r2h = vle32_v_f32m1(ptr + 5 * 4, vl); - _r3l = vle32_v_f32m1(ptr + 6 * 4, vl); - _r3h = vle32_v_f32m1(ptr + 7 * 4, vl); - _r4l = vle32_v_f32m1(ptr + 8 * 4, vl); - _r4h = vle32_v_f32m1(ptr + 9 * 4, vl); - _r5l = vle32_v_f32m1(ptr + 10 * 4, vl); - _r5h = vle32_v_f32m1(ptr + 11 * 4, vl); - _r6l = vle32_v_f32m1(ptr + 12 * 4, vl); - _r6h = vle32_v_f32m1(ptr + 13 * 4, vl); - _r7l = vle32_v_f32m1(ptr + 14 * 4, vl); - _r7h = vle32_v_f32m1(ptr + 15 * 4, vl); + _r0l = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl); + _r0h = __riscv_vle32_v_f32m1(ptr + 1 * 4, vl); + _r1l = __riscv_vle32_v_f32m1(ptr + 2 * 4, vl); + _r1h = 
__riscv_vle32_v_f32m1(ptr + 3 * 4, vl); + _r2l = __riscv_vle32_v_f32m1(ptr + 4 * 4, vl); + _r2h = __riscv_vle32_v_f32m1(ptr + 5 * 4, vl); + _r3l = __riscv_vle32_v_f32m1(ptr + 6 * 4, vl); + _r3h = __riscv_vle32_v_f32m1(ptr + 7 * 4, vl); + _r4l = __riscv_vle32_v_f32m1(ptr + 8 * 4, vl); + _r4h = __riscv_vle32_v_f32m1(ptr + 9 * 4, vl); + _r5l = __riscv_vle32_v_f32m1(ptr + 10 * 4, vl); + _r5h = __riscv_vle32_v_f32m1(ptr + 11 * 4, vl); + _r6l = __riscv_vle32_v_f32m1(ptr + 12 * 4, vl); + _r6h = __riscv_vle32_v_f32m1(ptr + 13 * 4, vl); + _r7l = __riscv_vle32_v_f32m1(ptr + 14 * 4, vl); + _r7h = __riscv_vle32_v_f32m1(ptr + 15 * 4, vl); } static inline void transpose4x4_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, size_t vl) { float tmp[16]; - vsseg4e32_v_f32m1(&tmp[0], _r0, _r1, _r2, _r3, vl); + vfloat32m1x4_t _r = __riscv_vcreate_v_f32m1x4(_r0, _r1, _r2, _r3); + __riscv_vsseg4e32_v_f32m1x4(&tmp[0], _r, vl); float* ptr = (float*)tmp; - _r0 = vle32_v_f32m1(ptr + 0 * 4, vl); - _r1 = vle32_v_f32m1(ptr + 1 * 4, vl); - _r2 = vle32_v_f32m1(ptr + 2 * 4, vl); - _r3 = vle32_v_f32m1(ptr + 3 * 4, vl); + _r0 = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl); + _r1 = __riscv_vle32_v_f32m1(ptr + 1 * 4, vl); + _r2 = __riscv_vle32_v_f32m1(ptr + 2 * 4, vl); + _r3 = __riscv_vle32_v_f32m1(ptr + 3 * 4, vl); } static inline void transpose8x12_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h, @@ -457,56 +183,55 @@ static inline void transpose8x12_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h, vfloat32m1_t& _rbl, vfloat32m1_t& _rbh, size_t vl) { float tmp[8][12]; - - vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0l, vl); - vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 12, _r0h, vl); - vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1l, vl); - vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 12, _r1h, vl); - vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 12, _r2l, vl); - vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 12, _r2h, vl); - vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 12, _r3l, vl); - vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 12, _r3h, vl); - vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 12, _r4l, vl); - vsse32_v_f32m1(&tmp[4][4], sizeof(float) * 12, _r4h, vl); - vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 12, _r5l, vl); - vsse32_v_f32m1(&tmp[4][5], sizeof(float) * 12, _r5h, vl); - vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 12, _r6l, vl); - vsse32_v_f32m1(&tmp[4][6], sizeof(float) * 12, _r6h, vl); - vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 12, _r7l, vl); - vsse32_v_f32m1(&tmp[4][7], sizeof(float) * 12, _r7h, vl); - vsse32_v_f32m1(&tmp[0][8], sizeof(float) * 12, _r8l, vl); - vsse32_v_f32m1(&tmp[4][8], sizeof(float) * 12, _r8h, vl); - vsse32_v_f32m1(&tmp[0][9], sizeof(float) * 12, _r9l, vl); - vsse32_v_f32m1(&tmp[4][9], sizeof(float) * 12, _r9h, vl); - vsse32_v_f32m1(&tmp[0][10], sizeof(float) * 12, _ral, vl); - vsse32_v_f32m1(&tmp[4][10], sizeof(float) * 12, _rah, vl); - vsse32_v_f32m1(&tmp[0][11], sizeof(float) * 12, _rbl, vl); - vsse32_v_f32m1(&tmp[4][11], sizeof(float) * 12, _rbh, vl); + __riscv_vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0l, vl); + __riscv_vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 12, _r0h, vl); + __riscv_vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1l, vl); + __riscv_vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 12, _r1h, vl); + __riscv_vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 12, _r2l, vl); + __riscv_vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 12, _r2h, vl); + __riscv_vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 12, _r3l, vl); + __riscv_vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 12, 
_r3h, vl); + __riscv_vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 12, _r4l, vl); + __riscv_vsse32_v_f32m1(&tmp[4][4], sizeof(float) * 12, _r4h, vl); + __riscv_vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 12, _r5l, vl); + __riscv_vsse32_v_f32m1(&tmp[4][5], sizeof(float) * 12, _r5h, vl); + __riscv_vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 12, _r6l, vl); + __riscv_vsse32_v_f32m1(&tmp[4][6], sizeof(float) * 12, _r6h, vl); + __riscv_vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 12, _r7l, vl); + __riscv_vsse32_v_f32m1(&tmp[4][7], sizeof(float) * 12, _r7h, vl); + __riscv_vsse32_v_f32m1(&tmp[0][8], sizeof(float) * 12, _r8l, vl); + __riscv_vsse32_v_f32m1(&tmp[4][8], sizeof(float) * 12, _r8h, vl); + __riscv_vsse32_v_f32m1(&tmp[0][9], sizeof(float) * 12, _r9l, vl); + __riscv_vsse32_v_f32m1(&tmp[4][9], sizeof(float) * 12, _r9h, vl); + __riscv_vsse32_v_f32m1(&tmp[0][10], sizeof(float) * 12, _ral, vl); + __riscv_vsse32_v_f32m1(&tmp[4][10], sizeof(float) * 12, _rah, vl); + __riscv_vsse32_v_f32m1(&tmp[0][11], sizeof(float) * 12, _rbl, vl); + __riscv_vsse32_v_f32m1(&tmp[4][11], sizeof(float) * 12, _rbh, vl); float* ptr = (float*)tmp; - _r0l = vle32_v_f32m1(ptr + 0 * 4, vl); - _r0h = vle32_v_f32m1(ptr + 1 * 4, vl); - _r1l = vle32_v_f32m1(ptr + 2 * 4, vl); - _r1h = vle32_v_f32m1(ptr + 3 * 4, vl); - _r2l = vle32_v_f32m1(ptr + 4 * 4, vl); - _r2h = vle32_v_f32m1(ptr + 5 * 4, vl); - _r3l = vle32_v_f32m1(ptr + 6 * 4, vl); - _r3h = vle32_v_f32m1(ptr + 7 * 4, vl); - _r4l = vle32_v_f32m1(ptr + 8 * 4, vl); - _r4h = vle32_v_f32m1(ptr + 9 * 4, vl); - _r5l = vle32_v_f32m1(ptr + 10 * 4, vl); - _r5h = vle32_v_f32m1(ptr + 11 * 4, vl); - _r6l = vle32_v_f32m1(ptr + 12 * 4, vl); - _r6h = vle32_v_f32m1(ptr + 13 * 4, vl); - _r7l = vle32_v_f32m1(ptr + 14 * 4, vl); - _r7h = vle32_v_f32m1(ptr + 15 * 4, vl); - _r8l = vle32_v_f32m1(ptr + 16 * 4, vl); - _r8h = vle32_v_f32m1(ptr + 17 * 4, vl); - _r9l = vle32_v_f32m1(ptr + 18 * 4, vl); - _r9h = vle32_v_f32m1(ptr + 19 * 4, vl); - _ral = vle32_v_f32m1(ptr + 20 * 4, vl); - _rah = vle32_v_f32m1(ptr + 21 * 4, vl); - _rbl = vle32_v_f32m1(ptr + 22 * 4, vl); - _rbh = vle32_v_f32m1(ptr + 23 * 4, vl); + _r0l = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl); + _r0h = __riscv_vle32_v_f32m1(ptr + 1 * 4, vl); + _r1l = __riscv_vle32_v_f32m1(ptr + 2 * 4, vl); + _r1h = __riscv_vle32_v_f32m1(ptr + 3 * 4, vl); + _r2l = __riscv_vle32_v_f32m1(ptr + 4 * 4, vl); + _r2h = __riscv_vle32_v_f32m1(ptr + 5 * 4, vl); + _r3l = __riscv_vle32_v_f32m1(ptr + 6 * 4, vl); + _r3h = __riscv_vle32_v_f32m1(ptr + 7 * 4, vl); + _r4l = __riscv_vle32_v_f32m1(ptr + 8 * 4, vl); + _r4h = __riscv_vle32_v_f32m1(ptr + 9 * 4, vl); + _r5l = __riscv_vle32_v_f32m1(ptr + 10 * 4, vl); + _r5h = __riscv_vle32_v_f32m1(ptr + 11 * 4, vl); + _r6l = __riscv_vle32_v_f32m1(ptr + 12 * 4, vl); + _r6h = __riscv_vle32_v_f32m1(ptr + 13 * 4, vl); + _r7l = __riscv_vle32_v_f32m1(ptr + 14 * 4, vl); + _r7h = __riscv_vle32_v_f32m1(ptr + 15 * 4, vl); + _r8l = __riscv_vle32_v_f32m1(ptr + 16 * 4, vl); + _r8h = __riscv_vle32_v_f32m1(ptr + 17 * 4, vl); + _r9l = __riscv_vle32_v_f32m1(ptr + 18 * 4, vl); + _r9h = __riscv_vle32_v_f32m1(ptr + 19 * 4, vl); + _ral = __riscv_vle32_v_f32m1(ptr + 20 * 4, vl); + _rah = __riscv_vle32_v_f32m1(ptr + 21 * 4, vl); + _rbl = __riscv_vle32_v_f32m1(ptr + 22 * 4, vl); + _rbh = __riscv_vle32_v_f32m1(ptr + 23 * 4, vl); } static inline void transpose12x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0m, vfloat32m1_t& _r0h, @@ -519,81 +244,83 @@ static inline void transpose12x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0m, vflo vfloat32m1_t& _r7l, vfloat32m1_t& 
_r7m, vfloat32m1_t& _r7h, size_t vl) { float tmp[96]; - vsseg8e32_v_f32m1(&tmp[0], _r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l, vl); - vsseg8e32_v_f32m1(&tmp[32], _r0m, _r1m, _r2m, _r3m, _r4m, _r5m, _r6m, _r7m, vl); - vsseg8e32_v_f32m1(&tmp[64], _r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h, vl); - + vfloat32m1x8_t _rl = __riscv_vcreate_v_f32m1x8(_r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l); + vfloat32m1x8_t _rm = __riscv_vcreate_v_f32m1x8(_r0m, _r1m, _r2m, _r3m, _r4m, _r5m, _r6m, _r7m); + vfloat32m1x8_t _rh = __riscv_vcreate_v_f32m1x8(_r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h); + __riscv_vsseg8e32_v_f32m1x8(&tmp[0], _rl, vl); + __riscv_vsseg8e32_v_f32m1x8(&tmp[32], _rm, vl); + __riscv_vsseg8e32_v_f32m1x8(&tmp[64], _rh, vl); float* ptr = (float*)tmp; - _r0l = vle32_v_f32m1(ptr + 0 * 4, vl); - _r0m = vle32_v_f32m1(ptr + 1 * 4, vl); - _r0h = vle32_v_f32m1(ptr + 2 * 4, vl); - _r1l = vle32_v_f32m1(ptr + 3 * 4, vl); - _r1m = vle32_v_f32m1(ptr + 4 * 4, vl); - _r1h = vle32_v_f32m1(ptr + 5 * 4, vl); - _r2l = vle32_v_f32m1(ptr + 6 * 4, vl); - _r2m = vle32_v_f32m1(ptr + 7 * 4, vl); - _r2h = vle32_v_f32m1(ptr + 8 * 4, vl); - _r3l = vle32_v_f32m1(ptr + 9 * 4, vl); - _r3m = vle32_v_f32m1(ptr + 10 * 4, vl); - _r3h = vle32_v_f32m1(ptr + 11 * 4, vl); - _r4l = vle32_v_f32m1(ptr + 12 * 4, vl); - _r4m = vle32_v_f32m1(ptr + 13 * 4, vl); - _r4h = vle32_v_f32m1(ptr + 14 * 4, vl); - _r5l = vle32_v_f32m1(ptr + 15 * 4, vl); - _r5m = vle32_v_f32m1(ptr + 16 * 4, vl); - _r5h = vle32_v_f32m1(ptr + 17 * 4, vl); - _r6l = vle32_v_f32m1(ptr + 18 * 4, vl); - _r6m = vle32_v_f32m1(ptr + 19 * 4, vl); - _r6h = vle32_v_f32m1(ptr + 20 * 4, vl); - _r7l = vle32_v_f32m1(ptr + 21 * 4, vl); - _r7m = vle32_v_f32m1(ptr + 22 * 4, vl); - _r7h = vle32_v_f32m1(ptr + 23 * 4, vl); + _r0l = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl); + _r0m = __riscv_vle32_v_f32m1(ptr + 1 * 4, vl); + _r0h = __riscv_vle32_v_f32m1(ptr + 2 * 4, vl); + _r1l = __riscv_vle32_v_f32m1(ptr + 3 * 4, vl); + _r1m = __riscv_vle32_v_f32m1(ptr + 4 * 4, vl); + _r1h = __riscv_vle32_v_f32m1(ptr + 5 * 4, vl); + _r2l = __riscv_vle32_v_f32m1(ptr + 6 * 4, vl); + _r2m = __riscv_vle32_v_f32m1(ptr + 7 * 4, vl); + _r2h = __riscv_vle32_v_f32m1(ptr + 8 * 4, vl); + _r3l = __riscv_vle32_v_f32m1(ptr + 9 * 4, vl); + _r3m = __riscv_vle32_v_f32m1(ptr + 10 * 4, vl); + _r3h = __riscv_vle32_v_f32m1(ptr + 11 * 4, vl); + _r4l = __riscv_vle32_v_f32m1(ptr + 12 * 4, vl); + _r4m = __riscv_vle32_v_f32m1(ptr + 13 * 4, vl); + _r4h = __riscv_vle32_v_f32m1(ptr + 14 * 4, vl); + _r5l = __riscv_vle32_v_f32m1(ptr + 15 * 4, vl); + _r5m = __riscv_vle32_v_f32m1(ptr + 16 * 4, vl); + _r5h = __riscv_vle32_v_f32m1(ptr + 17 * 4, vl); + _r6l = __riscv_vle32_v_f32m1(ptr + 18 * 4, vl); + _r6m = __riscv_vle32_v_f32m1(ptr + 19 * 4, vl); + _r6h = __riscv_vle32_v_f32m1(ptr + 20 * 4, vl); + _r7l = __riscv_vle32_v_f32m1(ptr + 21 * 4, vl); + _r7m = __riscv_vle32_v_f32m1(ptr + 22 * 4, vl); + _r7h = __riscv_vle32_v_f32m1(ptr + 23 * 4, vl); } static inline void transpose4x8_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, size_t vl) { float tmp[32]; - vsseg8e32_v_f32m1(&tmp[0], _r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, vl); - + vfloat32m1x8_t _r = __riscv_vcreate_v_f32m1x8(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); + __riscv_vsseg8e32_v_f32m1x8(&tmp[0], _r, vl); float* ptr = (float*)tmp; - _r0 = vle32_v_f32m1(ptr + 0 * 4, vl); - _r1 = vle32_v_f32m1(ptr + 1 * 4, vl); - _r2 = vle32_v_f32m1(ptr + 2 * 4, vl); - _r3 = 
vle32_v_f32m1(ptr + 3 * 4, vl); - _r4 = vle32_v_f32m1(ptr + 4 * 4, vl); - _r5 = vle32_v_f32m1(ptr + 5 * 4, vl); - _r6 = vle32_v_f32m1(ptr + 6 * 4, vl); - _r7 = vle32_v_f32m1(ptr + 7 * 4, vl); + _r0 = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl); + _r1 = __riscv_vle32_v_f32m1(ptr + 1 * 4, vl); + _r2 = __riscv_vle32_v_f32m1(ptr + 2 * 4, vl); + _r3 = __riscv_vle32_v_f32m1(ptr + 3 * 4, vl); + _r4 = __riscv_vle32_v_f32m1(ptr + 4 * 4, vl); + _r5 = __riscv_vle32_v_f32m1(ptr + 5 * 4, vl); + _r6 = __riscv_vle32_v_f32m1(ptr + 6 * 4, vl); + _r7 = __riscv_vle32_v_f32m1(ptr + 7 * 4, vl); } static inline void transpose4x12_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, vfloat32m1_t& _r8, vfloat32m1_t& _r9, vfloat32m1_t& _ra, vfloat32m1_t& _rb, size_t vl) { float tmp[4][12]; - vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0, vl); - vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1, vl); - vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 12, _r2, vl); - vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 12, _r3, vl); - vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 12, _r4, vl); - vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 12, _r5, vl); - vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 12, _r6, vl); - vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 12, _r7, vl); - vsse32_v_f32m1(&tmp[0][8], sizeof(float) * 12, _r8, vl); - vsse32_v_f32m1(&tmp[0][9], sizeof(float) * 12, _r9, vl); - vsse32_v_f32m1(&tmp[0][10], sizeof(float) * 12, _ra, vl); - vsse32_v_f32m1(&tmp[0][11], sizeof(float) * 12, _rb, vl); + __riscv_vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0, vl); + __riscv_vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1, vl); + __riscv_vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 12, _r2, vl); + __riscv_vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 12, _r3, vl); + __riscv_vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 12, _r4, vl); + __riscv_vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 12, _r5, vl); + __riscv_vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 12, _r6, vl); + __riscv_vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 12, _r7, vl); + __riscv_vsse32_v_f32m1(&tmp[0][8], sizeof(float) * 12, _r8, vl); + __riscv_vsse32_v_f32m1(&tmp[0][9], sizeof(float) * 12, _r9, vl); + __riscv_vsse32_v_f32m1(&tmp[0][10], sizeof(float) * 12, _ra, vl); + __riscv_vsse32_v_f32m1(&tmp[0][11], sizeof(float) * 12, _rb, vl); float* ptr = (float*)tmp; - _r0 = vle32_v_f32m1(ptr + 0 * 4, vl); - _r1 = vle32_v_f32m1(ptr + 1 * 4, vl); - _r2 = vle32_v_f32m1(ptr + 2 * 4, vl); - _r3 = vle32_v_f32m1(ptr + 3 * 4, vl); - _r4 = vle32_v_f32m1(ptr + 4 * 4, vl); - _r5 = vle32_v_f32m1(ptr + 5 * 4, vl); - _r6 = vle32_v_f32m1(ptr + 6 * 4, vl); - _r7 = vle32_v_f32m1(ptr + 7 * 4, vl); - _r8 = vle32_v_f32m1(ptr + 8 * 4, vl); - _r9 = vle32_v_f32m1(ptr + 9 * 4, vl); - _ra = vle32_v_f32m1(ptr + 10 * 4, vl); - _rb = vle32_v_f32m1(ptr + 11 * 4, vl); + _r0 = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl); + _r1 = __riscv_vle32_v_f32m1(ptr + 1 * 4, vl); + _r2 = __riscv_vle32_v_f32m1(ptr + 2 * 4, vl); + _r3 = __riscv_vle32_v_f32m1(ptr + 3 * 4, vl); + _r4 = __riscv_vle32_v_f32m1(ptr + 4 * 4, vl); + _r5 = __riscv_vle32_v_f32m1(ptr + 5 * 4, vl); + _r6 = __riscv_vle32_v_f32m1(ptr + 6 * 4, vl); + _r7 = __riscv_vle32_v_f32m1(ptr + 7 * 4, vl); + _r8 = __riscv_vle32_v_f32m1(ptr + 8 * 4, vl); + _r9 = __riscv_vle32_v_f32m1(ptr + 9 * 4, vl); + _ra = __riscv_vle32_v_f32m1(ptr + 10 * 4, vl); + _rb = __riscv_vle32_v_f32m1(ptr + 11 * 4, vl); } static inline void transpose8x4_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h, @@ 
-602,17 +329,19 @@ static inline void transpose8x4_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h, vfloat32m1_t& _r3l, vfloat32m1_t& _r3h, size_t vl) { float tmp[32]; - vsseg4e32_v_f32m1(&tmp[0], _r0l, _r1l, _r2l, _r3l, vl); - vsseg4e32_v_f32m1(&tmp[16], _r0h, _r1h, _r2h, _r3h, vl); + vfloat32m1x4_t _rl = __riscv_vcreate_v_f32m1x4(_r0l, _r1l, _r2l, _r3l); + vfloat32m1x4_t _rh = __riscv_vcreate_v_f32m1x4(_r0h, _r1h, _r2h, _r3h); + __riscv_vsseg4e32_v_f32m1x4(&tmp[0], _rl, vl); + __riscv_vsseg4e32_v_f32m1x4(&tmp[16], _rh, vl); float* ptr = (float*)tmp; - _r0l = vle32_v_f32m1(ptr + 0 * 4, vl); - _r0h = vle32_v_f32m1(ptr + 1 * 4, vl); - _r1l = vle32_v_f32m1(ptr + 2 * 4, vl); - _r1h = vle32_v_f32m1(ptr + 3 * 4, vl); - _r2l = vle32_v_f32m1(ptr + 4 * 4, vl); - _r2h = vle32_v_f32m1(ptr + 5 * 4, vl); - _r3l = vle32_v_f32m1(ptr + 6 * 4, vl); - _r3h = vle32_v_f32m1(ptr + 7 * 4, vl); + _r0l = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl); + _r0h = __riscv_vle32_v_f32m1(ptr + 1 * 4, vl); + _r1l = __riscv_vle32_v_f32m1(ptr + 2 * 4, vl); + _r1h = __riscv_vle32_v_f32m1(ptr + 3 * 4, vl); + _r2l = __riscv_vle32_v_f32m1(ptr + 4 * 4, vl); + _r2h = __riscv_vle32_v_f32m1(ptr + 5 * 4, vl); + _r3l = __riscv_vle32_v_f32m1(ptr + 6 * 4, vl); + _r3h = __riscv_vle32_v_f32m1(ptr + 7 * 4, vl); } #endif diff --git a/src/layer/riscv/rvv_mathfun.h b/src/layer/riscv/rvv_mathfun.h index 2ec10bae48a..c1da3b40f45 100644 --- a/src/layer/riscv/rvv_mathfun.h +++ b/src/layer/riscv/rvv_mathfun.h @@ -31,71 +31,71 @@ #define c_cephes_log_q1 -2.12194440e-4 #define c_cephes_log_q2 0.693359375 -#define _RVV_FLOAT32_LOG_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t log_ps(vfloat32m##LMUL##_t x, size_t vl) \ - { \ - x = vfmax_vf_f32m##LMUL(x, 0.f, vl); /* force flush to zero on denormal values */ \ - vbool##MLEN##_t invalid_mask = vmfle_vf_f32m##LMUL##_b##MLEN(x, 0.f, vl); \ - \ - vint32m##LMUL##_t ux = vreinterpret_v_f32m##LMUL##_i32m##LMUL(x); \ - \ - vint32m##LMUL##_t emm0 = vsra_vx_i32m##LMUL(ux, 23, vl); \ - \ - /* keep only the fractional part */ \ - ux = vand_vx_i32m##LMUL(ux, c_inv_mant_mask, vl); \ - ux = vor_vx_i32m##LMUL(ux, 1056964608 /* reinterpret_cast(0.5) */, vl); \ - x = vreinterpret_v_i32m##LMUL##_f32m##LMUL(ux); \ - \ - emm0 = vsub_vx_i32m##LMUL(emm0, 0x7f, vl); \ - vfloat32m##LMUL##_t e = vfcvt_f_x_v_f32m##LMUL(emm0, vl); \ - \ - e = vfadd_vf_f32m##LMUL(e, 1.f, vl); \ - \ - /* part2: */ \ - /* if( x < SQRTHF ) { */ \ - /* e -= 1; */ \ - /* x = x + x - 1.0; */ \ - /* } else { x = x - 1.0; } */ \ - vbool##MLEN##_t mask = vmflt_vf_f32m##LMUL##_b##MLEN(x, c_cephes_SQRTHF, vl); \ - x = vfadd_vv_f32m##LMUL##_m(mask, x, x, x, vl); \ - x = vfsub_vf_f32m##LMUL(x, 1.f, vl); \ - e = vfsub_vf_f32m##LMUL##_m(mask, e, e, 1.f, vl); \ - \ - vfloat32m##LMUL##_t z = vfmul_vv_f32m##LMUL(x, x, vl); \ - \ - vfloat32m##LMUL##_t y = vfmul_vf_f32m##LMUL(x, c_cephes_log_p0, vl); \ - y = vfadd_vf_f32m##LMUL(y, c_cephes_log_p1, vl); \ - y = vfmul_vv_f32m##LMUL(y, x, vl); \ - y = vfadd_vf_f32m##LMUL(y, c_cephes_log_p2, vl); \ - y = vfmul_vv_f32m##LMUL(y, x, vl); \ - y = vfadd_vf_f32m##LMUL(y, c_cephes_log_p3, vl); \ - y = vfmul_vv_f32m##LMUL(y, x, vl); \ - y = vfadd_vf_f32m##LMUL(y, c_cephes_log_p4, vl); \ - y = vfmul_vv_f32m##LMUL(y, x, vl); \ - y = vfadd_vf_f32m##LMUL(y, c_cephes_log_p5, vl); \ - y = vfmul_vv_f32m##LMUL(y, x, vl); \ - y = vfadd_vf_f32m##LMUL(y, c_cephes_log_p6, vl); \ - y = vfmul_vv_f32m##LMUL(y, x, vl); \ - y = vfadd_vf_f32m##LMUL(y, c_cephes_log_p7, vl); \ - y = vfmul_vv_f32m##LMUL(y, x, vl); \ - y = vfadd_vf_f32m##LMUL(y, 
c_cephes_log_p8, vl); \ - y = vfmul_vv_f32m##LMUL(y, x, vl); \ - \ - y = vfmul_vv_f32m##LMUL(y, z, vl); \ - \ - vfloat32m##LMUL##_t tmp = vfmul_vf_f32m##LMUL(e, c_cephes_log_q1, vl); \ - y = vfadd_vv_f32m##LMUL(y, tmp, vl); \ - \ - tmp = vfmul_vf_f32m##LMUL(z, 0.5f, vl); \ - y = vfsub_vv_f32m##LMUL(y, tmp, vl); \ - \ - tmp = vfmul_vf_f32m##LMUL(e, c_cephes_log_q2, vl); \ - x = vfadd_vv_f32m##LMUL(x, y, vl); \ - x = vfadd_vv_f32m##LMUL(x, tmp, vl); \ - /* negative arg will be NAN */ \ - vuint32m##LMUL##_t xtmp = vreinterpret_v_f32m##LMUL##_u32m##LMUL(x); \ - x = vreinterpret_v_u32m##LMUL##_f32m##LMUL(vor_vx_u32m##LMUL##_m(invalid_mask, xtmp, xtmp, 0xffffffff, vl)); \ - return x; \ +#define _RVV_FLOAT32_LOG_OP(LMUL, MLEN) \ + static inline vfloat32m##LMUL##_t log_ps(vfloat32m##LMUL##_t x, size_t vl) \ + { \ + x = __riscv_vfmax_vf_f32m##LMUL(x, 0.f, vl); /* force flush to zero on denormal values */ \ + vbool##MLEN##_t invalid_mask = __riscv_vmfle_vf_f32m##LMUL##_b##MLEN(x, 0.f, vl); \ + \ + vint32m##LMUL##_t ux = __riscv_vreinterpret_v_f32m##LMUL##_i32m##LMUL(x); \ + \ + vint32m##LMUL##_t emm0 = __riscv_vsra_vx_i32m##LMUL(ux, 23, vl); \ + \ + /* keep only the fractional part */ \ + ux = __riscv_vand_vx_i32m##LMUL(ux, c_inv_mant_mask, vl); \ + ux = __riscv_vor_vx_i32m##LMUL(ux, 1056964608 /* reinterpret_cast(0.5) */, vl); \ + x = __riscv_vreinterpret_v_i32m##LMUL##_f32m##LMUL(ux); \ + \ + emm0 = __riscv_vsub_vx_i32m##LMUL(emm0, 0x7f, vl); \ + vfloat32m##LMUL##_t e = __riscv_vfcvt_f_x_v_f32m##LMUL(emm0, vl); \ + \ + e = __riscv_vfadd_vf_f32m##LMUL(e, 1.f, vl); \ + \ + /* part2: */ \ + /* if( x < SQRTHF ) { */ \ + /* e -= 1; */ \ + /* x = x + x - 1.0; */ \ + /* } else { x = x - 1.0; } */ \ + vbool##MLEN##_t mask = __riscv_vmflt_vf_f32m##LMUL##_b##MLEN(x, c_cephes_SQRTHF, vl); \ + x = __riscv_vfadd_vv_f32m##LMUL##_mu(mask, x, x, x, vl); \ + x = __riscv_vfsub_vf_f32m##LMUL(x, 1.f, vl); \ + e = __riscv_vfsub_vf_f32m##LMUL##_mu(mask, e, e, 1.f, vl); \ + \ + vfloat32m##LMUL##_t z = __riscv_vfmul_vv_f32m##LMUL(x, x, vl); \ + \ + vfloat32m##LMUL##_t y = __riscv_vfmul_vf_f32m##LMUL(x, c_cephes_log_p0, vl); \ + y = __riscv_vfadd_vf_f32m##LMUL(y, c_cephes_log_p1, vl); \ + y = __riscv_vfmul_vv_f32m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f32m##LMUL(y, c_cephes_log_p2, vl); \ + y = __riscv_vfmul_vv_f32m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f32m##LMUL(y, c_cephes_log_p3, vl); \ + y = __riscv_vfmul_vv_f32m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f32m##LMUL(y, c_cephes_log_p4, vl); \ + y = __riscv_vfmul_vv_f32m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f32m##LMUL(y, c_cephes_log_p5, vl); \ + y = __riscv_vfmul_vv_f32m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f32m##LMUL(y, c_cephes_log_p6, vl); \ + y = __riscv_vfmul_vv_f32m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f32m##LMUL(y, c_cephes_log_p7, vl); \ + y = __riscv_vfmul_vv_f32m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f32m##LMUL(y, c_cephes_log_p8, vl); \ + y = __riscv_vfmul_vv_f32m##LMUL(y, x, vl); \ + \ + y = __riscv_vfmul_vv_f32m##LMUL(y, z, vl); \ + \ + vfloat32m##LMUL##_t tmp = __riscv_vfmul_vf_f32m##LMUL(e, c_cephes_log_q1, vl); \ + y = __riscv_vfadd_vv_f32m##LMUL(y, tmp, vl); \ + \ + tmp = __riscv_vfmul_vf_f32m##LMUL(z, 0.5f, vl); \ + y = __riscv_vfsub_vv_f32m##LMUL(y, tmp, vl); \ + \ + tmp = __riscv_vfmul_vf_f32m##LMUL(e, c_cephes_log_q2, vl); \ + x = __riscv_vfadd_vv_f32m##LMUL(x, y, vl); \ + x = __riscv_vfadd_vv_f32m##LMUL(x, tmp, vl); \ + /* negative arg will be NAN */ \ + vuint32m##LMUL##_t xtmp = 
__riscv_vreinterpret_v_f32m##LMUL##_u32m##LMUL(x); \ + x = __riscv_vreinterpret_v_u32m##LMUL##_f32m##LMUL(__riscv_vor_vx_u32m##LMUL##_mu(invalid_mask, xtmp, xtmp, 0xffffffff, vl)); \ + return x; \ } _RVV_FLOAT32_LOG_OP(1, 32) @@ -117,54 +117,54 @@ _RVV_FLOAT32_LOG_OP(8, 4) #define c_cephes_exp_p4 1.6666665459E-1 #define c_cephes_exp_p5 5.0000001201E-1 -#define _RVV_FLOAT32_EXP_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t exp_ps(vfloat32m##LMUL##_t x, size_t vl) \ - { \ - vfloat32m##LMUL##_t tmp, fx; \ - \ - x = vfmin_vf_f32m##LMUL(x, c_exp_hi, vl); \ - x = vfmax_vf_f32m##LMUL(x, c_exp_lo, vl); \ - \ - /* express exp(x) as exp(g + n*log(2)) */ \ - fx = vfmacc_vf_f32m##LMUL(vfmv_v_f_f32m##LMUL(0.5f, vl), c_cephes_LOG2EF, x, vl); \ - \ - /* perform a floorf */ \ - tmp = vfcvt_f_x_v_f32m##LMUL(vfcvt_x_f_v_i32m##LMUL(fx, vl), vl); \ - \ - /* if greater, substract 1 */ \ - vbool##MLEN##_t mask = vmfgt_vv_f32m##LMUL##_b##MLEN(tmp, fx, vl); \ - fx = vfsub_vf_f32m##LMUL##_m(mask, tmp, tmp, 1.f, vl); \ - \ - tmp = vfmul_vf_f32m##LMUL(fx, c_cephes_exp_C1, vl); \ - vfloat32m##LMUL##_t z = vfmul_vf_f32m##LMUL(fx, c_cephes_exp_C2, vl); \ - x = vfsub_vv_f32m##LMUL(x, tmp, vl); \ - x = vfsub_vv_f32m##LMUL(x, z, vl); \ - \ - vfloat32m##LMUL##_t y = vfmul_vf_f32m##LMUL(x, c_cephes_exp_p0, vl); \ - z = vfmul_vv_f32m##LMUL(x, x, vl); \ - \ - y = vfadd_vf_f32m##LMUL(y, c_cephes_exp_p1, vl); \ - y = vfmul_vv_f32m##LMUL(y, x, vl); \ - y = vfadd_vf_f32m##LMUL(y, c_cephes_exp_p2, vl); \ - y = vfmul_vv_f32m##LMUL(y, x, vl); \ - y = vfadd_vf_f32m##LMUL(y, c_cephes_exp_p3, vl); \ - y = vfmul_vv_f32m##LMUL(y, x, vl); \ - y = vfadd_vf_f32m##LMUL(y, c_cephes_exp_p4, vl); \ - y = vfmul_vv_f32m##LMUL(y, x, vl); \ - y = vfadd_vf_f32m##LMUL(y, c_cephes_exp_p5, vl); \ - \ - y = vfmul_vv_f32m##LMUL(y, z, vl); \ - y = vfadd_vv_f32m##LMUL(y, x, vl); \ - y = vfadd_vf_f32m##LMUL(y, 1.f, vl); \ - \ - /* build 2^n */ \ - vint32m##LMUL##_t mm = vfcvt_x_f_v_i32m##LMUL(fx, vl); \ - mm = vadd_vx_i32m##LMUL(mm, 0x7f, vl); \ - mm = vsll_vx_i32m##LMUL(mm, 23, vl); \ - vfloat32m##LMUL##_t pow2n = vreinterpret_v_i32m##LMUL##_f32m##LMUL(mm); \ - \ - y = vfmul_vv_f32m##LMUL(y, pow2n, vl); \ - return y; \ +#define _RVV_FLOAT32_EXP_OP(LMUL, MLEN) \ + static inline vfloat32m##LMUL##_t exp_ps(vfloat32m##LMUL##_t x, size_t vl) \ + { \ + vfloat32m##LMUL##_t tmp, fx; \ + \ + x = __riscv_vfmin_vf_f32m##LMUL(x, c_exp_hi, vl); \ + x = __riscv_vfmax_vf_f32m##LMUL(x, c_exp_lo, vl); \ + \ + /* express exp(x) as exp(g + n*log(2)) */ \ + fx = __riscv_vfmacc_vf_f32m##LMUL(__riscv_vfmv_v_f_f32m##LMUL(0.5f, vl), c_cephes_LOG2EF, x, vl); \ + \ + /* perform a floorf */ \ + tmp = __riscv_vfcvt_f_x_v_f32m##LMUL(__riscv_vfcvt_x_f_v_i32m##LMUL(fx, vl), vl); \ + \ + /* if greater, substract 1 */ \ + vbool##MLEN##_t mask = __riscv_vmfgt_vv_f32m##LMUL##_b##MLEN(tmp, fx, vl); \ + fx = __riscv_vfsub_vf_f32m##LMUL##_mu(mask, tmp, tmp, 1.f, vl); \ + \ + tmp = __riscv_vfmul_vf_f32m##LMUL(fx, c_cephes_exp_C1, vl); \ + vfloat32m##LMUL##_t z = __riscv_vfmul_vf_f32m##LMUL(fx, c_cephes_exp_C2, vl); \ + x = __riscv_vfsub_vv_f32m##LMUL(x, tmp, vl); \ + x = __riscv_vfsub_vv_f32m##LMUL(x, z, vl); \ + \ + vfloat32m##LMUL##_t y = __riscv_vfmul_vf_f32m##LMUL(x, c_cephes_exp_p0, vl); \ + z = __riscv_vfmul_vv_f32m##LMUL(x, x, vl); \ + \ + y = __riscv_vfadd_vf_f32m##LMUL(y, c_cephes_exp_p1, vl); \ + y = __riscv_vfmul_vv_f32m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f32m##LMUL(y, c_cephes_exp_p2, vl); \ + y = __riscv_vfmul_vv_f32m##LMUL(y, x, vl); \ + y = 
__riscv_vfadd_vf_f32m##LMUL(y, c_cephes_exp_p3, vl); \ + y = __riscv_vfmul_vv_f32m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f32m##LMUL(y, c_cephes_exp_p4, vl); \ + y = __riscv_vfmul_vv_f32m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f32m##LMUL(y, c_cephes_exp_p5, vl); \ + \ + y = __riscv_vfmul_vv_f32m##LMUL(y, z, vl); \ + y = __riscv_vfadd_vv_f32m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f32m##LMUL(y, 1.f, vl); \ + \ + /* build 2^n */ \ + vint32m##LMUL##_t mm = __riscv_vfcvt_x_f_v_i32m##LMUL(fx, vl); \ + mm = __riscv_vadd_vx_i32m##LMUL(mm, 0x7f, vl); \ + mm = __riscv_vsll_vx_i32m##LMUL(mm, 23, vl); \ + vfloat32m##LMUL##_t pow2n = __riscv_vreinterpret_v_i32m##LMUL##_f32m##LMUL(mm); \ + \ + y = __riscv_vfmul_vv_f32m##LMUL(y, pow2n, vl); \ + return y; \ } _RVV_FLOAT32_EXP_OP(1, 32) @@ -183,73 +183,73 @@ _RVV_FLOAT32_EXP_OP(8, 4) #define c_coscof_p2 4.166664568298827E-002 #define c_cephes_FOPI 1.27323954473516 // 4 / M_PI -#define _RVV_FLOAT32_SINCOS_OP(LMUL, MLEN) \ - static inline void sincos_ps(vfloat32m##LMUL##_t x, vfloat32m##LMUL##_t* ysin, vfloat32m##LMUL##_t* ycos, size_t vl) \ - { \ - /* any x */ \ - vfloat32m##LMUL##_t xmm1, xmm2, xmm3, y; \ - \ - vuint32m##LMUL##_t emm2; \ - \ - vbool##MLEN##_t sign_mask_sin, sign_mask_cos; \ - sign_mask_sin = vmflt_vf_f32m##LMUL##_b##MLEN(x, 0.f, vl); \ - x = vfsgnj_vf_f32m##LMUL(x, 1.f, vl); \ - \ - /* scale by 4/Pi */ \ - y = vfmul_vf_f32m##LMUL(x, c_cephes_FOPI, vl); \ - \ - /* store the integer part of y in mm0 */ \ - emm2 = vfcvt_xu_f_v_u32m##LMUL(y, vl); \ - /* j=(j+1) & (~1) (see the cephes sources) */ \ - emm2 = vadd_vx_u32m##LMUL(emm2, 1, vl); \ - emm2 = vand_vx_u32m##LMUL(emm2, ~1, vl); \ - y = vfcvt_f_xu_v_f32m##LMUL(emm2, vl); \ - \ - /* get the polynom selection mask */ \ - /* there is one polynom for 0 <= x <= Pi/4 */ \ - /* and another one for Pi/4 tmpx(vl); \ - std::vector tmpy(vl); \ - vse32_v_f32m##LMUL(tmpx.data(), a, vl); \ - vse32_v_f32m##LMUL(tmpy.data(), b, vl); \ - for (size_t i = 0; i < vl; i++) \ - { \ - tmpx[i] = atan2(tmpx[i], tmpy[i]); \ - } \ - return vle32_v_f32m##LMUL(tmpx.data(), vl); \ +#define _RVV_FLOAT32_ATAN2_OP(LMUL, MLEN) \ + static inline vfloat32m##LMUL##_t atan2_ps(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, volatile size_t vl) \ + { \ + std::vector tmpx(vl); \ + std::vector tmpy(vl); \ + __riscv_vse32_v_f32m##LMUL(tmpx.data(), a, vl); \ + __riscv_vse32_v_f32m##LMUL(tmpy.data(), b, vl); \ + for (size_t i = 0; i < vl; i++) \ + { \ + tmpx[i] = atan2(tmpx[i], tmpy[i]); \ + } \ + return __riscv_vle32_v_f32m##LMUL(tmpx.data(), vl); \ } _RVV_FLOAT32_ATAN2_OP(1, 32) diff --git a/src/layer/riscv/rvv_mathfun_fp16s.h b/src/layer/riscv/rvv_mathfun_fp16s.h index 2cf5d08f4f0..99408c4f0da 100644 --- a/src/layer/riscv/rvv_mathfun_fp16s.h +++ b/src/layer/riscv/rvv_mathfun_fp16s.h @@ -31,71 +31,71 @@ #define c_cephes_log_q1 -2.12194440e-4 #define c_cephes_log_q2 0.693359375 -#define _RVV_FLOAT16_LOG_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t log_ps(vfloat16m##LMUL##_t x, size_t vl) \ - { \ - x = vfmax_vf_f16m##LMUL(x, 0.f, vl); /* force flush to zero on denormal values */ \ - vbool##MLEN##_t invalid_mask = vmfle_vf_f16m##LMUL##_b##MLEN(x, 0.f, vl); \ - \ - vint16m##LMUL##_t ux = vreinterpret_v_f16m##LMUL##_i16m##LMUL(x); \ - \ - vint16m##LMUL##_t emm0 = vsra_vx_i16m##LMUL(ux, 10, vl); \ - \ - /* keep only the fractional part */ \ - ux = vand_vx_i16m##LMUL(ux, c_inv_mant_mask_f16, vl); \ - ux = vor_vx_i16m##LMUL(ux, 14336 /* reinterpret_cast((__fp16)0.5) */, vl); \ - x = 
vreinterpret_v_i16m##LMUL##_f16m##LMUL(ux); \ - \ - emm0 = vsub_vx_i16m##LMUL(emm0, 0xf, vl); \ - vfloat16m##LMUL##_t e = vfcvt_f_x_v_f16m##LMUL(emm0, vl); \ - \ - e = vfadd_vf_f16m##LMUL(e, 1.f, vl); \ - \ - /* part2: */ \ - /* if( x < SQRTHF ) { */ \ - /* e -= 1; */ \ - /* x = x + x - 1.0; */ \ - /* } else { x = x - 1.0; } */ \ - vbool##MLEN##_t mask = vmflt_vf_f16m##LMUL##_b##MLEN(x, c_cephes_SQRTHF, vl); \ - x = vfadd_vv_f16m##LMUL##_m(mask, x, x, x, vl); \ - x = vfsub_vf_f16m##LMUL(x, 1.f, vl); \ - e = vfsub_vf_f16m##LMUL##_m(mask, e, e, 1.f, vl); \ - \ - vfloat16m##LMUL##_t z = vfmul_vv_f16m##LMUL(x, x, vl); \ - \ - vfloat16m##LMUL##_t y = vfmul_vf_f16m##LMUL(x, c_cephes_log_p0, vl); \ - y = vfadd_vf_f16m##LMUL(y, c_cephes_log_p1, vl); \ - y = vfmul_vv_f16m##LMUL(y, x, vl); \ - y = vfadd_vf_f16m##LMUL(y, c_cephes_log_p2, vl); \ - y = vfmul_vv_f16m##LMUL(y, x, vl); \ - y = vfadd_vf_f16m##LMUL(y, c_cephes_log_p3, vl); \ - y = vfmul_vv_f16m##LMUL(y, x, vl); \ - y = vfadd_vf_f16m##LMUL(y, c_cephes_log_p4, vl); \ - y = vfmul_vv_f16m##LMUL(y, x, vl); \ - y = vfadd_vf_f16m##LMUL(y, c_cephes_log_p5, vl); \ - y = vfmul_vv_f16m##LMUL(y, x, vl); \ - y = vfadd_vf_f16m##LMUL(y, c_cephes_log_p6, vl); \ - y = vfmul_vv_f16m##LMUL(y, x, vl); \ - y = vfadd_vf_f16m##LMUL(y, c_cephes_log_p7, vl); \ - y = vfmul_vv_f16m##LMUL(y, x, vl); \ - y = vfadd_vf_f16m##LMUL(y, c_cephes_log_p8, vl); \ - y = vfmul_vv_f16m##LMUL(y, x, vl); \ - \ - y = vfmul_vv_f16m##LMUL(y, z, vl); \ - \ - vfloat16m##LMUL##_t tmp = vfmul_vf_f16m##LMUL(e, c_cephes_log_q1, vl); \ - y = vfadd_vv_f16m##LMUL(y, tmp, vl); \ - \ - tmp = vfmul_vf_f16m##LMUL(z, 0.5f, vl); \ - y = vfsub_vv_f16m##LMUL(y, tmp, vl); \ - \ - tmp = vfmul_vf_f16m##LMUL(e, c_cephes_log_q2, vl); \ - x = vfadd_vv_f16m##LMUL(x, y, vl); \ - x = vfadd_vv_f16m##LMUL(x, tmp, vl); \ - /* negative arg will be NAN */ \ - vuint16m##LMUL##_t xtmp = vreinterpret_v_f16m##LMUL##_u16m##LMUL(x); \ - x = vreinterpret_v_u16m##LMUL##_f16m##LMUL(vor_vx_u16m##LMUL##_m(invalid_mask, xtmp, xtmp, 0xffff, vl)); \ - return x; \ +#define _RVV_FLOAT16_LOG_OP(LMUL, MLEN) \ + static inline vfloat16m##LMUL##_t log_ps(vfloat16m##LMUL##_t x, size_t vl) \ + { \ + x = __riscv_vfmax_vf_f16m##LMUL(x, (__fp16)0.f, vl); /* force flush to zero on denormal values */ \ + vbool##MLEN##_t invalid_mask = __riscv_vmfle_vf_f16m##LMUL##_b##MLEN(x, (__fp16)0.f, vl); \ + \ + vint16m##LMUL##_t ux = __riscv_vreinterpret_v_f16m##LMUL##_i16m##LMUL(x); \ + \ + vint16m##LMUL##_t emm0 = __riscv_vsra_vx_i16m##LMUL(ux, 10, vl); \ + \ + /* keep only the fractional part */ \ + ux = __riscv_vand_vx_i16m##LMUL(ux, c_inv_mant_mask_f16, vl); \ + ux = __riscv_vor_vx_i16m##LMUL(ux, 14336 /* reinterpret_cast((__fp16)0.5) */, vl); \ + x = __riscv_vreinterpret_v_i16m##LMUL##_f16m##LMUL(ux); \ + \ + emm0 = __riscv_vsub_vx_i16m##LMUL(emm0, 0xf, vl); \ + vfloat16m##LMUL##_t e = __riscv_vfcvt_f_x_v_f16m##LMUL(emm0, vl); \ + \ + e = __riscv_vfadd_vf_f16m##LMUL(e, (__fp16)1.f, vl); \ + \ + /* part2: */ \ + /* if( x < SQRTHF ) { */ \ + /* e -= 1; */ \ + /* x = x + x - 1.0; */ \ + /* } else { x = x - 1.0; } */ \ + vbool##MLEN##_t mask = __riscv_vmflt_vf_f16m##LMUL##_b##MLEN(x, (__fp16)c_cephes_SQRTHF, vl); \ + x = __riscv_vfadd_vv_f16m##LMUL##_mu(mask, x, x, x, vl); \ + x = __riscv_vfsub_vf_f16m##LMUL(x, (__fp16)1.f, vl); \ + e = __riscv_vfsub_vf_f16m##LMUL##_mu(mask, e, e, (__fp16)1.f, vl); \ + \ + vfloat16m##LMUL##_t z = __riscv_vfmul_vv_f16m##LMUL(x, x, vl); \ + \ + vfloat16m##LMUL##_t y = __riscv_vfmul_vf_f16m##LMUL(x, 
(__fp16)c_cephes_log_p0, vl); \ + y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p1, vl); \ + y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p2, vl); \ + y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p3, vl); \ + y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p4, vl); \ + y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p5, vl); \ + y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p6, vl); \ + y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p7, vl); \ + y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p8, vl); \ + y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ + \ + y = __riscv_vfmul_vv_f16m##LMUL(y, z, vl); \ + \ + vfloat16m##LMUL##_t tmp = __riscv_vfmul_vf_f16m##LMUL(e, (__fp16)c_cephes_log_q1, vl); \ + y = __riscv_vfadd_vv_f16m##LMUL(y, tmp, vl); \ + \ + tmp = __riscv_vfmul_vf_f16m##LMUL(z, (__fp16)0.5f, vl); \ + y = __riscv_vfsub_vv_f16m##LMUL(y, tmp, vl); \ + \ + tmp = __riscv_vfmul_vf_f16m##LMUL(e, (__fp16)c_cephes_log_q2, vl); \ + x = __riscv_vfadd_vv_f16m##LMUL(x, y, vl); \ + x = __riscv_vfadd_vv_f16m##LMUL(x, tmp, vl); \ + /* negative arg will be NAN */ \ + vuint16m##LMUL##_t xtmp = __riscv_vreinterpret_v_f16m##LMUL##_u16m##LMUL(x); \ + x = __riscv_vreinterpret_v_u16m##LMUL##_f16m##LMUL(__riscv_vor_vx_u16m##LMUL##_mu(invalid_mask, xtmp, xtmp, 0xffff, vl)); \ + return x; \ } _RVV_FLOAT16_LOG_OP(1, 16) @@ -117,54 +117,54 @@ _RVV_FLOAT16_LOG_OP(8, 2) #define c_cephes_exp_p4 1.6666665459E-1 #define c_cephes_exp_p5 5.0000001201E-1 -#define _RVV_FLOAT16_EXP_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t exp_ps(vfloat16m##LMUL##_t x, size_t vl) \ - { \ - vfloat16m##LMUL##_t tmp, fx; \ - \ - x = vfmin_vf_f16m##LMUL(x, c_exp_hi_f16, vl); \ - x = vfmax_vf_f16m##LMUL(x, c_exp_lo_f16, vl); \ - \ - /* express exp(x) as exp(g + n*log(2)) */ \ - fx = vfmacc_vf_f16m##LMUL(vfmv_v_f_f16m##LMUL(0.5f, vl), c_cephes_LOG2EF, x, vl); \ - \ - /* perform a floorf */ \ - tmp = vfcvt_f_x_v_f16m##LMUL(vfcvt_x_f_v_i16m##LMUL(fx, vl), vl); \ - \ - /* if greater, substract 1 */ \ - vbool##MLEN##_t mask = vmfgt_vv_f16m##LMUL##_b##MLEN(tmp, fx, vl); \ - fx = vfsub_vf_f16m##LMUL##_m(mask, tmp, tmp, 1.f, vl); \ - \ - tmp = vfmul_vf_f16m##LMUL(fx, c_cephes_exp_C1, vl); \ - vfloat16m##LMUL##_t z = vfmul_vf_f16m##LMUL(fx, c_cephes_exp_C2, vl); \ - x = vfsub_vv_f16m##LMUL(x, tmp, vl); \ - x = vfsub_vv_f16m##LMUL(x, z, vl); \ - \ - vfloat16m##LMUL##_t y = vfmul_vf_f16m##LMUL(x, c_cephes_exp_p0, vl); \ - z = vfmul_vv_f16m##LMUL(x, x, vl); \ - \ - y = vfadd_vf_f16m##LMUL(y, c_cephes_exp_p1, vl); \ - y = vfmul_vv_f16m##LMUL(y, x, vl); \ - y = vfadd_vf_f16m##LMUL(y, c_cephes_exp_p2, vl); \ - y = vfmul_vv_f16m##LMUL(y, x, vl); \ - y = vfadd_vf_f16m##LMUL(y, c_cephes_exp_p3, vl); \ - y = vfmul_vv_f16m##LMUL(y, x, vl); \ - y = vfadd_vf_f16m##LMUL(y, c_cephes_exp_p4, vl); \ - y = vfmul_vv_f16m##LMUL(y, x, vl); \ - y = vfadd_vf_f16m##LMUL(y, c_cephes_exp_p5, vl); \ - \ - y = vfmul_vv_f16m##LMUL(y, z, vl); \ - y = vfadd_vv_f16m##LMUL(y, x, vl); \ - y = vfadd_vf_f16m##LMUL(y, 1.f, vl); \ - \ - /* build 2^n */ \ - vint16m##LMUL##_t mm = vfcvt_x_f_v_i16m##LMUL(fx, vl); \ - mm = vadd_vx_i16m##LMUL(mm, 0xf, vl); \ - mm = vsll_vx_i16m##LMUL(mm, 
10, vl); \ - vfloat16m##LMUL##_t pow2n = vreinterpret_v_i16m##LMUL##_f16m##LMUL(mm); \ - \ - y = vfmul_vv_f16m##LMUL(y, pow2n, vl); \ - return y; \ +#define _RVV_FLOAT16_EXP_OP(LMUL, MLEN) \ + static inline vfloat16m##LMUL##_t exp_ps(vfloat16m##LMUL##_t x, size_t vl) \ + { \ + vfloat16m##LMUL##_t tmp, fx; \ + \ + x = __riscv_vfmin_vf_f16m##LMUL(x, (__fp16)c_exp_hi_f16, vl); \ + x = __riscv_vfmax_vf_f16m##LMUL(x, (__fp16)c_exp_lo_f16, vl); \ + \ + /* express exp(x) as exp(g + n*log(2)) */ \ + fx = __riscv_vfmacc_vf_f16m##LMUL(__riscv_vfmv_v_f_f16m##LMUL((__fp16)0.5f, vl), (__fp16)c_cephes_LOG2EF, x, vl); \ + \ + /* perform a floorf */ \ + tmp = __riscv_vfcvt_f_x_v_f16m##LMUL(__riscv_vfcvt_x_f_v_i16m##LMUL(fx, vl), vl); \ + \ + /* if greater, substract 1 */ \ + vbool##MLEN##_t mask = __riscv_vmfgt_vv_f16m##LMUL##_b##MLEN(tmp, fx, vl); \ + fx = __riscv_vfsub_vf_f16m##LMUL##_mu(mask, tmp, tmp, (__fp16)1.f, vl); \ + \ + tmp = __riscv_vfmul_vf_f16m##LMUL(fx, (__fp16)c_cephes_exp_C1, vl); \ + vfloat16m##LMUL##_t z = __riscv_vfmul_vf_f16m##LMUL(fx, (__fp16)c_cephes_exp_C2, vl); \ + x = __riscv_vfsub_vv_f16m##LMUL(x, tmp, vl); \ + x = __riscv_vfsub_vv_f16m##LMUL(x, z, vl); \ + \ + vfloat16m##LMUL##_t y = __riscv_vfmul_vf_f16m##LMUL(x, (__fp16)c_cephes_exp_p0, vl); \ + z = __riscv_vfmul_vv_f16m##LMUL(x, x, vl); \ + \ + y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_exp_p1, vl); \ + y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_exp_p2, vl); \ + y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_exp_p3, vl); \ + y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_exp_p4, vl); \ + y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_exp_p5, vl); \ + \ + y = __riscv_vfmul_vv_f16m##LMUL(y, z, vl); \ + y = __riscv_vfadd_vv_f16m##LMUL(y, x, vl); \ + y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)1.f, vl); \ + \ + /* build 2^n */ \ + vint16m##LMUL##_t mm = __riscv_vfcvt_x_f_v_i16m##LMUL(fx, vl); \ + mm = __riscv_vadd_vx_i16m##LMUL(mm, 0xf, vl); \ + mm = __riscv_vsll_vx_i16m##LMUL(mm, 10, vl); \ + vfloat16m##LMUL##_t pow2n = __riscv_vreinterpret_v_i16m##LMUL##_f16m##LMUL(mm); \ + \ + y = __riscv_vfmul_vv_f16m##LMUL(y, pow2n, vl); \ + return y; \ } _RVV_FLOAT16_EXP_OP(1, 16) @@ -183,73 +183,73 @@ _RVV_FLOAT16_EXP_OP(8, 2) #define c_coscof_p2 4.166664568298827E-002 #define c_cephes_FOPI 1.27323954473516 // 4 / M_PI -#define _RVV_FLOAT16_SINCOS_OP(LMUL, MLEN) \ - static inline void sincos_ps(vfloat16m##LMUL##_t x, vfloat16m##LMUL##_t* ysin, vfloat16m##LMUL##_t* ycos, size_t vl) \ - { \ - /* any x */ \ - vfloat16m##LMUL##_t xmm1, xmm2, xmm3, y; \ - \ - vuint16m##LMUL##_t emm2; \ - \ - vbool##MLEN##_t sign_mask_sin, sign_mask_cos; \ - sign_mask_sin = vmflt_vf_f16m##LMUL##_b##MLEN(x, 0.f, vl); \ - x = vfsgnj_vf_f16m##LMUL(x, 1.f, vl); \ - \ - /* scale by 4/Pi */ \ - y = vfmul_vf_f16m##LMUL(x, c_cephes_FOPI, vl); \ - \ - /* store the integer part of y in mm0 */ \ - emm2 = vfcvt_xu_f_v_u16m##LMUL(y, vl); \ - /* j=(j+1) & (~1) (see the cephes sources) */ \ - emm2 = vadd_vx_u16m##LMUL(emm2, 1, vl); \ - emm2 = vand_vx_u16m##LMUL(emm2, ~1, vl); \ - y = vfcvt_f_xu_v_f16m##LMUL(emm2, vl); \ - \ - /* get the polynom selection mask */ \ - /* there is one polynom for 0 <= x <= Pi/4 */ \ - /* and another one for Pi/4 tmpx(vl); \ std::vector<__fp16> tmpy(vl); \ - vse16_v_f16m##LMUL(tmpx.data(), a, vl); \ - 
vse16_v_f16m##LMUL(tmpy.data(), b, vl); \ + __riscv_vse16_v_f16m##LMUL(tmpx.data(), a, vl); \ + __riscv_vse16_v_f16m##LMUL(tmpy.data(), b, vl); \ for (size_t i = 0; i < vl; i++) \ { \ tmpx[i] = (__fp16)atan2((float)tmpx[i], (float)tmpy[i]); \ } \ - return vle16_v_f16m##LMUL(tmpx.data(), vl); \ + return __riscv_vle16_v_f16m##LMUL(tmpx.data(), vl); \ } -_RVV_FLOAT16_ATAN2_OP(1, 32) -_RVV_FLOAT16_ATAN2_OP(2, 16) -_RVV_FLOAT16_ATAN2_OP(4, 8) -_RVV_FLOAT16_ATAN2_OP(8, 4) +_RVV_FLOAT16_ATAN2_OP(1, 16) +_RVV_FLOAT16_ATAN2_OP(2, 8) +_RVV_FLOAT16_ATAN2_OP(4, 4) +_RVV_FLOAT16_ATAN2_OP(8, 2) #endif // RVV_MATHFUN_FP16S_H diff --git a/src/layer/riscv/selu_riscv.cpp b/src/layer/riscv/selu_riscv.cpp index 36b55044aca..f131d4ac42f 100644 --- a/src/layer/riscv/selu_riscv.cpp +++ b/src/layer/riscv/selu_riscv.cpp @@ -23,6 +23,11 @@ namespace ncnn { int SELU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { +#if C906 + // FIXME -O3 leads illegal instruction + return SELU::forward_inplace(bottom_top_blob, opt); +#endif + int w = bottom_top_blob.w; int h = bottom_top_blob.h; int d = bottom_top_blob.d; @@ -39,17 +44,17 @@ int SELU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - vbool4_t _lower = vmflt_vf_f32m8_b4(_p, 0.f, vl); - vbool4_t _higher = vmnot_m_b4(_lower, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vbool4_t _lower = __riscv_vmflt_vf_f32m8_b4(_p, 0.f, vl); + vbool4_t _higher = __riscv_vmnot_m_b4(_lower, vl); - _p = vfmul_vf_f32m8_m(_higher, _p, /*op1*/ _p, lambda, vl); + _p = __riscv_vfmul_vf_f32m8_mu(_higher, _p, _p, lambda, vl); vfloat32m8_t _nps = exp_ps(_p, vl); - _nps = vfsub_vf_f32m8_m(_lower, _p, /*op1*/ _nps, 1.f, vl); - _nps = vfmul_vf_f32m8_m(_lower, _p, /*op1*/ _nps, alphaxlambda, vl); + _nps = __riscv_vfsub_vf_f32m8_mu(_lower, _p, _nps, 1.f, vl); + _nps = __riscv_vfmul_vf_f32m8_mu(_lower, _p, _nps, alphaxlambda, vl); - vse32_v_f32m8(ptr, _nps, vl); + __riscv_vse32_v_f32m8(ptr, _nps, vl); ptr += vl; n -= vl; } diff --git a/src/layer/riscv/sigmoid_riscv.cpp b/src/layer/riscv/sigmoid_riscv.cpp index 14770f95e78..e929ba82a4e 100644 --- a/src/layer/riscv/sigmoid_riscv.cpp +++ b/src/layer/riscv/sigmoid_riscv.cpp @@ -17,24 +17,29 @@ #if __riscv_vector #include #include "rvv_mathfun.h" -#include "rvv_mathfun_fp16s.h" #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { Sigmoid_riscv::Sigmoid_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif } int Sigmoid_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) @@ -62,11 +67,11 @@ int Sigmoid_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size; while (n > 0) { - size_t vl = vsetvl_e32m8(n); + size_t vl = __riscv_vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); _p = sigmoid_ps(_p, vl); - vse32_v_f32m8(ptr, _p, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; n -= vl; @@ -84,68 +89,4 @@ int Sigmoid_riscv::forward_inplace(Mat& bottom_top_blob, const Option& 
opt) cons return 0; } -#if __riscv_vector && __riscv_zfh -int Sigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const -{ - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int d = bottom_top_blob.d; - int channels = bottom_top_blob.c; - int elempack = bottom_top_blob.elempack; - int size = w * h * d * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = vsetvl_e16m4(n); - - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); - _p = sigmoid_ps(_p, vl); - vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); - - ptr += vl; - n -= vl; - } - } - - return 0; -} - -int Sigmoid_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const -{ - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int d = bottom_top_blob.d; - int channels = bottom_top_blob.c; - int elempack = bottom_top_blob.elempack; - int size = w * h * d * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - _p = sigmoid_ps(_p, vl); - vse16_v_f16m8(ptr, _p, vl); - - ptr += vl; - n -= vl; - } - } - - return 0; -} -#endif // __riscv_vector && __riscv_zfh - } // namespace ncnn diff --git a/src/layer/riscv/sigmoid_riscv.h b/src/layer/riscv/sigmoid_riscv.h index 8f014e6c4f2..67378486789 100644 --- a/src/layer/riscv/sigmoid_riscv.h +++ b/src/layer/riscv/sigmoid_riscv.h @@ -27,7 +27,7 @@ class Sigmoid_riscv : public Sigmoid virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/sigmoid_riscv_zfh.cpp b/src/layer/riscv/sigmoid_riscv_zfh.cpp new file mode 100644 index 00000000000..f4806f64e8f --- /dev/null +++ b/src/layer/riscv/sigmoid_riscv_zfh.cpp @@ -0,0 +1,109 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
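The body of this new sigmoid_riscv_zfh.cpp (below) keeps the fp16-storage kernels in their own translation unit: when the compiler provides the Zvfh vector extension the loop widens __fp16 data to fp32, evaluates sigmoid_ps from rvv_mathfun.h, and narrows back, otherwise it falls back to scalar __fp16 math. The following is a minimal standalone sketch of that widen/compute/narrow pattern, not part of the patch; the function name sigmoid_fp16s_sketch is invented for illustration, and sigmoid_ps is assumed to come from ncnn's rvv_mathfun.h exactly as in the files below.

#include <math.h>
#if __riscv_vector
#include <riscv_vector.h>
#include "rvv_mathfun.h" // assumed: provides sigmoid_ps for vfloat32m8_t
#endif

static void sigmoid_fp16s_sketch(__fp16* ptr, int size)
{
#if __riscv_vector && __riscv_zvfh
    int n = size;
    while (n > 0)
    {
        size_t vl = __riscv_vsetvl_e16m4(n);
        // load fp16, widen to fp32 (m4 -> m8), evaluate in fp32, narrow back to fp16
        vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl);
        _p = sigmoid_ps(_p, vl);
        __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl);
        ptr += vl;
        n -= vl;
    }
#else
    // scalar fallback when the vector fp16 extension is not compiled in
    for (int i = 0; i < size; i++)
    {
        ptr[i] = (__fp16)(1.f / (1.f + expf(-(float)ptr[i])));
    }
#endif
}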
+ +#include "sigmoid_riscv.h" + +#if __riscv_vector +#include +#include "rvv_mathfun.h" +#if __riscv_zvfh +#include "rvv_mathfun_fp16s.h" +#endif +#endif // __riscv_vector + +namespace ncnn { + +#if NCNN_ZFH +int Sigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + + vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); + _p = sigmoid_ps(_p, vl); + __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = (__fp16)(1.f / (1.f + exp(-(float)*ptr))); + + ptr++; + } +#endif // __riscv_zvfh + } + + return 0; +} + +int Sigmoid_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + _p = sigmoid_ps(_p, vl); + __riscv_vse16_v_f16m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = (__fp16)1.f / ((__fp16)1.f + (__fp16)exp((float)-*ptr)); + + ptr++; + } +#endif // __riscv_zvfh + } + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/softmax_riscv.cpp b/src/layer/riscv/softmax_riscv.cpp index ca910c3d3c0..a53b05dffef 100644 --- a/src/layer/riscv/softmax_riscv.cpp +++ b/src/layer/riscv/softmax_riscv.cpp @@ -33,7 +33,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int elempack = bottom_top_blob.elempack; int positive_axis = axis < 0 ? 
dims + axis : axis; -#ifdef __riscv_vector +#if __riscv_vector if (dims == 1) // positive_axis == 0 { int w = bottom_top_blob.w; @@ -44,13 +44,13 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_vol = ptr; while (n > 0) { - size_t vl = vsetvl_e32m8(n); + size_t vl = __riscv_vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl); - vfloat32m1_t _max = vfmv_s_f_f32m1(vundefined_f32m1(), max, vl); - _max = vfredmax_vs_f32m8_f32m1(_max, _p, /* scalar*/ _max, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr_vol, vl); + vfloat32m1_t _max = __riscv_vfmv_s_f_f32m1(max, vl); + _max = __riscv_vfredmax_vs_f32m8_f32m1(_p, _max, vl); - max = vfmv_f_s_f32m1_f32(_max); + max = __riscv_vfmv_f_s_f32m1_f32(_max); ptr_vol += vl; n -= vl; } @@ -61,16 +61,16 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons ptr_vol = ptr; while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl); - vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m1_t _sum = __riscv_vfmv_s_f_f32m1(sum, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr_vol, vl); - _p = vfsub_vf_f32m8(_p, max, vl); + _p = __riscv_vfsub_vf_f32m8(_p, max, vl); _p = exp_ps(_p, vl); - _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /*scalar*/ _sum, vl); + _sum = __riscv_vfredusum_vs_f32m8_f32m1(_p, _sum, vl); - vse32_v_f32m8(ptr_vol, _p, vl); - sum = vfmv_f_s_f32m1_f32(_sum); + __riscv_vse32_v_f32m8(ptr_vol, _p, vl); + sum = __riscv_vfmv_f_s_f32m1_f32(_sum); ptr_vol += vl; n -= vl; } @@ -80,11 +80,11 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons ptr_vol = ptr; while (n > 0) { - size_t vl = vsetvl_e32m8(n); + size_t vl = __riscv_vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl); - _p = vfdiv_vf_f32m8(_p, sum, vl); - vse32_v_f32m8(ptr_vol, _p, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr_vol, vl); + _p = __riscv_vfdiv_vf_f32m8(_p, sum, vl); + __riscv_vse32_v_f32m8(ptr_vol, _p, vl); n -= vl; ptr_vol += vl; @@ -112,14 +112,14 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - size_t vl = vsetvl_e32m8(n); + size_t vl = __riscv_vsetvl_e32m8(n); - vfloat32m8_t _max = vle32_v_f32m8(ptr_max, vl); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); + vfloat32m8_t _max = __riscv_vle32_v_f32m8(ptr_max, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); - _max = vfmax_vv_f32m8(_max, _p, vl); + _max = __riscv_vfmax_vv_f32m8(_max, _p, vl); - vse32_v_f32m8(ptr_max, _max, vl); + __riscv_vse32_v_f32m8(ptr_max, _max, vl); ptr += vl; ptr_max += vl; n -= vl; @@ -141,18 +141,18 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons while (n > 0) { - size_t vl = vsetvl_e32m8(n); + size_t vl = __riscv_vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - vfloat32m8_t _max = vle32_v_f32m8(ptr_max, vl); - vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vfloat32m8_t _max = __riscv_vle32_v_f32m8(ptr_max, vl); + vfloat32m8_t _sum = __riscv_vle32_v_f32m8(ptr_sum, vl); - _p = vfsub_vv_f32m8(_p, _max, vl); + _p = __riscv_vfsub_vv_f32m8(_p, _max, vl); _p = exp_ps(_p, vl); - _sum = vfadd_vv_f32m8(_sum, _p, vl); + _sum = __riscv_vfadd_vv_f32m8(_sum, _p, vl); - vse32_v_f32m8(ptr, _p, vl); - vse32_v_f32m8(ptr_sum, _sum, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); + __riscv_vse32_v_f32m8(ptr_sum, _sum, 
vl); n -= vl; ptr_max += vl; ptr_sum += vl; @@ -168,13 +168,13 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vfloat32m8_t _sum = __riscv_vle32_v_f32m8(ptr_sum, vl); - _p = vfdiv_vv_f32m8(_p, _sum, vl); + _p = __riscv_vfdiv_vv_f32m8(_p, _sum, vl); - vse32_v_f32m8(ptr, _p, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); n -= vl; ptr += vl; ptr_sum += vl; @@ -198,13 +198,13 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr1 = ptr; while (n1 > 0) { - size_t vl = vsetvl_e32m8(n1); - vfloat32m8_t _p = vle32_v_f32m8(ptr1, vl); - vfloat32m1_t _m = vfmv_s_f_f32m1(vundefined_f32m1(), m, vl); + size_t vl = __riscv_vsetvl_e32m8(n1); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr1, vl); + vfloat32m1_t _m = __riscv_vfmv_s_f_f32m1(m, vl); - _m = vfredmax_vs_f32m8_f32m1(_m, _p, _m, vl); + _m = __riscv_vfredmax_vs_f32m8_f32m1(_p, _m, vl); - m = vfmv_f_s_f32m1_f32(_m); + m = __riscv_vfmv_f_s_f32m1_f32(_m); ptr1 += vl; n1 -= vl; } @@ -215,15 +215,15 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr2 = ptr; while (n2 > 0) { - size_t vl = vsetvl_e32m8(n2); - vfloat32m8_t _p = vle32_v_f32m8(ptr2, vl); - vfloat32m1_t _s = vfmv_s_f_f32m1(vundefined_f32m1(), s, vl); + size_t vl = __riscv_vsetvl_e32m8(n2); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr2, vl); + vfloat32m1_t _s = __riscv_vfmv_s_f_f32m1(s, vl); - _p = exp_ps(vfsub_vf_f32m8(_p, m, vl), vl); - _s = vfredusum_vs_f32m8_f32m1(_s, _p, _s, vl); + _p = exp_ps(__riscv_vfsub_vf_f32m8(_p, m, vl), vl); + _s = __riscv_vfredusum_vs_f32m8_f32m1(_p, _s, vl); - vse32_v_f32m8(ptr2, _p, vl); - s = vfmv_f_s_f32m1_f32(_s); + __riscv_vse32_v_f32m8(ptr2, _p, vl); + s = __riscv_vfmv_f_s_f32m1_f32(_s); ptr2 += vl; n2 -= vl; } @@ -233,13 +233,13 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr3 = ptr; while (n3 > 0) { - size_t vl = vsetvl_e32m8(n3); + size_t vl = __riscv_vsetvl_e32m8(n3); - vfloat32m8_t _p = vle32_v_f32m8(ptr3, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr3, vl); - _p = vfdiv_vf_f32m8(_p, s, vl); + _p = __riscv_vfdiv_vf_f32m8(_p, s, vl); - vse32_v_f32m8(ptr3, _p, vl); + __riscv_vse32_v_f32m8(ptr3, _p, vl); n3 -= vl; ptr3 += vl; } @@ -269,12 +269,12 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size * elempack; while (n > 0) { - size_t vl = vsetvl_e32m8(n); + size_t vl = __riscv_vsetvl_e32m8(n); - vfloat32m8_t _max = vle32_v_f32m8(max, vl); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - _max = vfmax_vv_f32m8(_max, _p, vl); - vse32_v_f32m8(ptr_max, _max, vl); + vfloat32m8_t _max = __riscv_vle32_v_f32m8(max, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + _max = __riscv_vfmax_vv_f32m8(_max, _p, vl); + __riscv_vse32_v_f32m8(ptr_max, _max, vl); ptr += vl; ptr_max += vl; @@ -295,14 +295,14 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size * elempack; while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - vfloat32m8_t _max = vle32_v_f32m8(ptr_max, vl); - vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl); - _p = exp_ps(vfsub_vv_f32m8(_p, _max, vl), vl); - _sum = vfadd_vv_f32m8(_sum, _p, vl); - vse32_v_f32m8(ptr, 
_p, vl); - vse32_v_f32m8(ptr_sum, _sum, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vfloat32m8_t _max = __riscv_vle32_v_f32m8(ptr_max, vl); + vfloat32m8_t _sum = __riscv_vle32_v_f32m8(ptr_sum, vl); + _p = exp_ps(__riscv_vfsub_vv_f32m8(_p, _max, vl), vl); + _sum = __riscv_vfadd_vv_f32m8(_sum, _p, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); + __riscv_vse32_v_f32m8(ptr_sum, _sum, vl); n -= vl; ptr += vl; @@ -319,12 +319,12 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size * elempack; while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vfloat32m8_t _sum = __riscv_vle32_v_f32m8(ptr_sum, vl); - _p = vfdiv_vv_f32m8(_p, _sum, vl); - vse32_v_f32m8(ptr, _p, vl); + _p = __riscv_vfdiv_vv_f32m8(_p, _sum, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr_sum += vl; ptr += vl; @@ -358,12 +358,12 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _maxptr = vle32_v_f32m8(maxptr_vol, vl); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _maxptr = __riscv_vle32_v_f32m8(maxptr_vol, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); - _maxptr = vfmax_vv_f32m8(_maxptr, _p, vl); - vse32_v_f32m8(maxptr_vol, _maxptr, vl); + _maxptr = __riscv_vfmax_vv_f32m8(_maxptr, _p, vl); + __riscv_vse32_v_f32m8(maxptr_vol, _maxptr, vl); ptr += vl; maxptr_vol += vl; @@ -392,16 +392,16 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons while (n) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - vfloat32m8_t _maxptr = vle32_v_f32m8(maxptr_vol, vl); - vfloat32m8_t _sumptr = vle32_v_f32m8(sumptr_vol, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vfloat32m8_t _maxptr = __riscv_vle32_v_f32m8(maxptr_vol, vl); + vfloat32m8_t _sumptr = __riscv_vle32_v_f32m8(sumptr_vol, vl); - _p = exp_ps(vfsub_vv_f32m8(_p, _maxptr, vl), vl); - _sumptr = vfadd_vv_f32m8(_sumptr, _p, vl); + _p = exp_ps(__riscv_vfsub_vv_f32m8(_p, _maxptr, vl), vl); + _sumptr = __riscv_vfadd_vv_f32m8(_sumptr, _p, vl); - vse32_v_f32m8(ptr, _p, vl); - vse32_v_f32m8(sumptr_vol, _sumptr, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); + __riscv_vse32_v_f32m8(sumptr_vol, _sumptr, vl); n -= vl; sumptr_vol += vl; maxptr_vol += vl; @@ -422,13 +422,13 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - size_t vl = vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - vfloat32m8_t _sumptr = vle32_v_f32m8(sumptr_vol, vl); + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + vfloat32m8_t _sumptr = __riscv_vle32_v_f32m8(sumptr_vol, vl); - _p = vfdiv_vv_f32m8(_p, _sumptr, vl); + _p = __riscv_vfdiv_vv_f32m8(_p, _sumptr, vl); - vse32_v_f32m8(ptr, _p, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); n -= vl; sumptr_vol += vl; ptr += vl; @@ -457,12 +457,12 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_1 = ptr; while (n1 > 0) { - size_t vl = vsetvl_e32m8(n1); - vfloat32m8_t _p = vle32_v_f32m8(ptr_1, vl); - vfloat32m1_t _scalar_max = vfmv_s_f_f32m1(vundefined_f32m1(), max, vl); - 
_scalar_max = vfredmax_vs_f32m8_f32m1(_scalar_max, _p, _scalar_max, vl); + size_t vl = __riscv_vsetvl_e32m8(n1); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr_1, vl); + vfloat32m1_t _scalar_max = __riscv_vfmv_s_f_f32m1(max, vl); + _scalar_max = __riscv_vfredmax_vs_f32m8_f32m1(_p, _scalar_max, vl); - max = vfmv_f_s_f32m1_f32(_scalar_max); + max = __riscv_vfmv_f_s_f32m1_f32(_scalar_max); n1 -= vl; ptr_1 += vl; } @@ -473,15 +473,15 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_2 = ptr; while (n2 > 0) { - size_t vl = vsetvl_e32m8(n2); - vfloat32m8_t _p = vle32_v_f32m8(ptr_2, vl); - vfloat32m1_t _scalar_sum = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl); + size_t vl = __riscv_vsetvl_e32m8(n2); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr_2, vl); + vfloat32m1_t _scalar_sum = __riscv_vfmv_s_f_f32m1(sum, vl); - _p = exp_ps(vfsub_vf_f32m8(_p, max, vl), vl); - _scalar_sum = vfredusum_vs_f32m8_f32m1(_scalar_sum, _p, _scalar_sum, vl); + _p = exp_ps(__riscv_vfsub_vf_f32m8(_p, max, vl), vl); + _scalar_sum = __riscv_vfredusum_vs_f32m8_f32m1(_p, _scalar_sum, vl); - vse32_v_f32m8(ptr_2, _p, vl); - sum = vfmv_f_s_f32m1_f32(_scalar_sum); + __riscv_vse32_v_f32m8(ptr_2, _p, vl); + sum = __riscv_vfmv_f_s_f32m1_f32(_scalar_sum); n2 -= vl; ptr_2 += vl; } @@ -491,12 +491,12 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_3 = ptr; while (n3 > 0) { - size_t vl = vsetvl_e32m8(n3); - vfloat32m8_t _p = vle32_v_f32m8(ptr_3, vl); + size_t vl = __riscv_vsetvl_e32m8(n3); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr_3, vl); - _p = vfdiv_vf_f32m8(_p, sum, vl); + _p = __riscv_vfdiv_vf_f32m8(_p, sum, vl); - vse32_v_f32m8(ptr_3, _p, vl); + __riscv_vse32_v_f32m8(ptr_3, _p, vl); n3 -= vl; ptr_3 += vl; } diff --git a/src/layer/riscv/swish_riscv.cpp b/src/layer/riscv/swish_riscv.cpp index 7e2e2488c42..f09e4065d2a 100644 --- a/src/layer/riscv/swish_riscv.cpp +++ b/src/layer/riscv/swish_riscv.cpp @@ -17,24 +17,29 @@ #if __riscv_vector #include #include "rvv_mathfun.h" -#include "rvv_mathfun_fp16s.h" #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { Swish_riscv::Swish_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif } int Swish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) @@ -62,11 +67,11 @@ int Swish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - size_t vl = vsetvl_e32m8(n); + size_t vl = __riscv_vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - _p = vfdiv_vv_f32m8(_p, vfadd_vf_f32m8(exp_ps(vfneg_v_f32m8(_p, vl), vl), 1.f, vl), vl); - vse32_v_f32m8(ptr, _p, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); + _p = __riscv_vfdiv_vv_f32m8(_p, __riscv_vfadd_vf_f32m8(exp_ps(__riscv_vfneg_v_f32m8(_p, vl), vl), 1.f, vl), vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; n -= vl; @@ -83,68 +88,4 @@ int Swish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return 0; } -#if __riscv_vector && __riscv_zfh -int Swish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const -{ - int w = bottom_top_blob.w; - int h = 
bottom_top_blob.h; - int d = bottom_top_blob.d; - int channels = bottom_top_blob.c; - int elempack = bottom_top_blob.elempack; - int size = w * h * d * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = vsetvl_e16m4(n); - - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); - _p = vfdiv_vv_f32m8(_p, vfadd_vf_f32m8(exp_ps(vfneg_v_f32m8(_p, vl), vl), 1.f, vl), vl); - vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); - - ptr += vl; - n -= vl; - } - } - - return 0; -} - -int Swish_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const -{ - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int d = bottom_top_blob.d; - int channels = bottom_top_blob.c; - int elempack = bottom_top_blob.elempack; - int size = w * h * d * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - _p = vfdiv_vv_f16m8(_p, vfadd_vf_f16m8(exp_ps(vfneg_v_f16m8(_p, vl), vl), 1.f, vl), vl); - vse16_v_f16m8(ptr, _p, vl); - - ptr += vl; - n -= vl; - } - } - - return 0; -} -#endif // __riscv_vector && __riscv_zfh - } // namespace ncnn diff --git a/src/layer/riscv/swish_riscv.h b/src/layer/riscv/swish_riscv.h index 05d5cbe1cfd..971b5cb2b40 100644 --- a/src/layer/riscv/swish_riscv.h +++ b/src/layer/riscv/swish_riscv.h @@ -27,7 +27,7 @@ class Swish_riscv : public Swish virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/swish_riscv_zfh.cpp b/src/layer/riscv/swish_riscv_zfh.cpp new file mode 100644 index 00000000000..2fa4e028cd2 --- /dev/null +++ b/src/layer/riscv/swish_riscv_zfh.cpp @@ -0,0 +1,108 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
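A recurring change across these kernels (see the selu hunk and the rvv_mathfun macros earlier) is that masked intrinsics now spell out their mask policy: the old vfmul_vf_f32m8_m(mask, maskedoff, op1, f, vl) becomes __riscv_vfmul_vf_f32m8_mu(...), where the _mu suffix (mask undisturbed) keeps the maskedoff value in inactive lanes. A small hedged sketch of that policy, with an invented helper name, which scales only the negative lanes of a buffer:

#include <riscv_vector.h>

// scale only the negative elements by alpha; non-negative lanes are left untouched
static void scale_negatives_sketch(float* ptr, int size, float alpha)
{
    int n = size;
    while (n > 0)
    {
        size_t vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
        vbool4_t _neg = __riscv_vmflt_vf_f32m8_b4(_p, 0.f, vl);
        // _mu: inactive lanes keep the value of the maskedoff operand (_p here)
        _p = __riscv_vfmul_vf_f32m8_mu(_neg, _p, _p, alpha, vl);
        __riscv_vse32_v_f32m8(ptr, _p, vl);
        ptr += vl;
        n -= vl;
    }
}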
+ +#include "swish_riscv.h" + +#if __riscv_vector +#include +#include "rvv_mathfun.h" +#if __riscv_zvfh +#include "rvv_mathfun_fp16s.h" +#endif +#endif // __riscv_vector + +namespace ncnn { + +#if NCNN_ZFH +int Swish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + + vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); + _p = __riscv_vfdiv_vv_f32m8(_p, __riscv_vfadd_vf_f32m8(exp_ps(__riscv_vfneg_v_f32m8(_p, vl), vl), (__fp16)1.f, vl), vl); + __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + float v = (float)*ptr; + *ptr = (__fp16)(v / (1.f + exp(-v))); + ptr++; + } +#endif // __riscv_zvfh + } + + return 0; +} + +int Swish_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + _p = __riscv_vfdiv_vv_f16m8(_p, __riscv_vfadd_vf_f16m8(exp_ps(__riscv_vfneg_v_f16m8(_p, vl), vl), (__fp16)1.f, vl), vl); + __riscv_vse16_v_f16m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = *ptr / ((__fp16)1.f + (__fp16)exp((float)-*ptr)); + ptr++; + } +#endif // __riscv_zvfh + } + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/tanh_riscv.cpp b/src/layer/riscv/tanh_riscv.cpp index 0c147b15bd6..8c3bdb22b3c 100644 --- a/src/layer/riscv/tanh_riscv.cpp +++ b/src/layer/riscv/tanh_riscv.cpp @@ -17,24 +17,29 @@ #if __riscv_vector #include #include "rvv_mathfun.h" -#include "rvv_mathfun_fp16s.h" #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { TanH_riscv::TanH_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif } int TanH_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); if (opt.use_fp16_storage && elembits == 16) @@ -62,11 +67,11 @@ int TanH_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - size_t vl = vsetvl_e32m8(n); + size_t vl = __riscv_vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); _p = tanh_ps(_p, vl); - vse32_v_f32m8(ptr, _p, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; n -= vl; @@ -83,68 +88,4 @@ int 
TanH_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return 0; } -#if __riscv_vector && __riscv_zfh -int TanH_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const -{ - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int d = bottom_top_blob.d; - int channels = bottom_top_blob.c; - int elempack = bottom_top_blob.elempack; - int size = w * h * d * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = vsetvl_e16m4(n); - - vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); - _p = tanh_ps(_p, vl); - vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); - - ptr += vl; - n -= vl; - } - } - - return 0; -} - -int TanH_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const -{ - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int d = bottom_top_blob.d; - int channels = bottom_top_blob.c; - int elempack = bottom_top_blob.elempack; - int size = w * h * d * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - _p = tanh_ps(_p, vl); - vse16_v_f16m8(ptr, _p, vl); - - ptr += vl; - n -= vl; - } - } - - return 0; -} -#endif // __riscv_vector && __riscv_zfh - } // namespace ncnn diff --git a/src/layer/riscv/tanh_riscv.h b/src/layer/riscv/tanh_riscv.h index 6fb22ce91f3..69cb0d4e7cc 100644 --- a/src/layer/riscv/tanh_riscv.h +++ b/src/layer/riscv/tanh_riscv.h @@ -27,7 +27,7 @@ class TanH_riscv : public TanH virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; #endif diff --git a/src/layer/riscv/tanh_riscv_zfh.cpp b/src/layer/riscv/tanh_riscv_zfh.cpp new file mode 100644 index 00000000000..6cdb9113231 --- /dev/null +++ b/src/layer/riscv/tanh_riscv_zfh.cpp @@ -0,0 +1,107 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
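Further down, the unaryop floor and ceil functors drop the old compare-and-adjust sequence in favour of the v1.0 conversions that take an explicit rounding mode (__RISCV_FRM_RDN for floor, __RISCV_FRM_RUP for ceil). A minimal sketch of the floor case, not part of the patch and with an invented function name; like the original code it assumes inputs that fit in int32:

#include <riscv_vector.h>

// floor each element: convert to int32 with round-down, then back to float
static void floor_inplace_sketch(float* ptr, int size)
{
    int n = size;
    while (n > 0)
    {
        size_t vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
        // the old API converted with the default rounding mode and then
        // subtracted 1 under a comparison mask; the rounding mode is now explicit
        vint32m8_t _xi = __riscv_vfcvt_x_f_v_i32m8_rm(_p, __RISCV_FRM_RDN, vl);
        _p = __riscv_vfcvt_f_x_v_f32m8(_xi, vl);
        __riscv_vse32_v_f32m8(ptr, _p, vl);
        ptr += vl;
        n -= vl;
    }
}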
+ +#include "tanh_riscv.h" + +#if __riscv_vector +#include +#include "rvv_mathfun.h" +#if __riscv_zvfh +#include "rvv_mathfun_fp16s.h" +#endif +#endif // __riscv_vector + +namespace ncnn { + +#if NCNN_ZFH +int TanH_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m4(n); + + vfloat32m8_t _p = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr, vl), vl); + _p = tanh_ps(_p, vl); + __riscv_vse16_v_f16m4(ptr, __riscv_vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = (__fp16)tanh((float)*ptr); + ptr++; + } +#endif // __riscv_zvfh + } + + return 0; +} + +int TanH_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + +#if __riscv_zvfh + int n = size; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + _p = tanh_ps(_p, vl); + __riscv_vse16_v_f16m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + *ptr = (__fp16)tanh((float)*ptr); + ptr++; + } +#endif // __riscv_zvfh + } + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn diff --git a/src/layer/riscv/unaryop_riscv.cpp b/src/layer/riscv/unaryop_riscv.cpp index b6acf25e438..bd9ae699600 100644 --- a/src/layer/riscv/unaryop_riscv.cpp +++ b/src/layer/riscv/unaryop_riscv.cpp @@ -14,22 +14,29 @@ #include "unaryop_riscv.h" +#include + #if __riscv_vector #include #include "rvv_mathfun.h" -#include "rvv_mathfun_fp16s.h" #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { UnaryOp_riscv::UnaryOp_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zfh - support_fp16_storage = true; -#endif #endif // __riscv_vector +#if NCNN_ZFH +#if __riscv_vector + support_fp16_storage = cpu_support_riscv_zvfh(); +#else + support_fp16_storage = cpu_support_riscv_zfh(); +#endif +#endif } #if __riscv_vector @@ -53,11 +60,11 @@ static int unary_op_inplace(Mat& a, const Option& opt) int n = size * elempack; while (n > 0) { - size_t vl = vsetvl_e32m8(n); + size_t vl = __riscv_vsetvl_e32m8(n); - vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); + vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); _p = op(_p, vl); - vse32_v_f32m8(ptr, _p, vl); + __riscv_vse32_v_f32m8(ptr, _p, vl); ptr += vl; n -= vl; @@ -73,7 +80,7 @@ struct unary_op_abs { vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { - return vfsgnj_vf_f32m8(x, 1.f, vl); + return __riscv_vfsgnj_vf_f32m8(x, 1.f, vl); } }; @@ -81,7 +88,7 @@ struct unary_op_neg { vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { - return vfneg_v_f32m8(x, vl); + return __riscv_vfneg_v_f32m8(x, vl); } }; @@ -89,9 +96,7 @@ struct unary_op_floor { vfloat32m8_t 
operator()(const vfloat32m8_t& x, const size_t& vl) const { - vint32m8_t _xi = vfcvt_x_f_v_i32m8(x, vl); - vbool4_t _mask = vmfgt_vv_f32m8_b4(vfcvt_f_x_v_f32m8(_xi, vl), x, vl); - return vfcvt_f_x_v_f32m8(vsub_vx_i32m8_m(_mask, _xi, _xi, 1, vl), vl); + return __riscv_vfcvt_f_x_v_f32m8(__riscv_vfcvt_x_f_v_i32m8_rm(x, __RISCV_FRM_RDN, vl), vl); } }; @@ -99,9 +104,7 @@ struct unary_op_ceil { vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { - vint32m8_t _xi = vfcvt_x_f_v_i32m8(x, vl); - vbool4_t _mask = vmflt_vv_f32m8_b4(vfcvt_f_x_v_f32m8(_xi, vl), x, vl); - return vfcvt_f_x_v_f32m8(vadd_vx_i32m8_m(_mask, _xi, _xi, 1, vl), vl); + return __riscv_vfcvt_f_x_v_f32m8(__riscv_vfcvt_x_f_v_i32m8_rm(x, __RISCV_FRM_RUP, vl), vl); } }; @@ -109,7 +112,7 @@ struct unary_op_square { vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { - return vfmul_vv_f32m8(x, x, vl); + return __riscv_vfmul_vv_f32m8(x, x, vl); } }; @@ -117,7 +120,7 @@ struct unary_op_sqrt { vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { - return vfsqrt_v_f32m8(x, vl); + return __riscv_vfsqrt_v_f32m8(x, vl); } }; @@ -125,12 +128,12 @@ struct unary_op_rsqrt { vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { -#if C906 - vfloat32m8_t _reciprocal = vfrdiv_vf_f32m8(vfsqrt_v_f32m8(x, vl), 1.f, vl); +#if __riscv_xtheadvector + vfloat32m8_t _reciprocal = __riscv_vfrdiv_vf_f32m8(__riscv_vfsqrt_v_f32m8(x, vl), 1.f, vl); #else - vfloat32m8_t _reciprocal = vfrsqrt7_v_f32m8(x, vl); - _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(vfmul_vf_f32m8(x, 0.5f, vl), vfmul_vv_f32m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl); - // _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(vfmul_vf_f32m8(x, 0.5f, vl), vfmul_vv_f32m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl); + vfloat32m8_t _reciprocal = __riscv_vfrsqrt7_v_f32m8(x, vl); + _reciprocal = __riscv_vfmul_vv_f32m8(__riscv_vfrsub_vf_f32m8(__riscv_vfmul_vv_f32m8(__riscv_vfmul_vf_f32m8(x, 0.5f, vl), __riscv_vfmul_vv_f32m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl); + // _reciprocal = __riscv_vfmul_vv_f32m8(__riscv_vfrsub_vf_f32m8(__riscv_vfmul_vv_f32m8(__riscv_vfmul_vf_f32m8(x, 0.5f, vl), __riscv_vfmul_vv_f32m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl); #endif return _reciprocal; } @@ -174,12 +177,12 @@ struct unary_op_tan { // TODO rvv optimize std::vector tmp(vl); - vse32_v_f32m8(tmp.data(), x, vl); + __riscv_vse32_v_f32m8(tmp.data(), x, vl); for (size_t i = 0; i < vl; i++) { tmp[i] = tanf(tmp[i]); } - return vle32_v_f32m8(tmp.data(), vl); + return __riscv_vle32_v_f32m8(tmp.data(), vl); } }; @@ -189,12 +192,12 @@ struct unary_op_asin { // TODO rvv optimize std::vector tmp(vl); - vse32_v_f32m8(tmp.data(), x, vl); + __riscv_vse32_v_f32m8(tmp.data(), x, vl); for (size_t i = 0; i < vl; i++) { tmp[i] = asinf(tmp[i]); } - return vle32_v_f32m8(tmp.data(), vl); + return __riscv_vle32_v_f32m8(tmp.data(), vl); } }; @@ -204,12 +207,12 @@ struct unary_op_acos { // TODO rvv optimize std::vector tmp(vl); - vse32_v_f32m8(tmp.data(), x, vl); + __riscv_vse32_v_f32m8(tmp.data(), x, vl); for (size_t i = 0; i < vl; i++) { tmp[i] = acosf(tmp[i]); } - return vle32_v_f32m8(tmp.data(), vl); + return __riscv_vle32_v_f32m8(tmp.data(), vl); } }; @@ -219,12 +222,12 @@ struct unary_op_atan { // TODO rvv optimize std::vector tmp(vl); - vse32_v_f32m8(tmp.data(), x, vl); + __riscv_vse32_v_f32m8(tmp.data(), x, vl); for (size_t i = 0; i < vl; i++) 
{ tmp[i] = atanf(tmp[i]); } - return vle32_v_f32m8(tmp.data(), vl); + return __riscv_vle32_v_f32m8(tmp.data(), vl); } }; @@ -232,12 +235,12 @@ struct unary_op_reciprocal { vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { -#if C906 - vfloat32m8_t _reciprocal = vfrdiv_vf_f32m8(x, 1.f, vl); +#if __riscv_xtheadvector + vfloat32m8_t _reciprocal = __riscv_vfrdiv_vf_f32m8(x, 1.f, vl); #else - vfloat32m8_t _reciprocal = vfrec7_v_f32m8(x, vl); - _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl); - // _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl); + vfloat32m8_t _reciprocal = __riscv_vfrec7_v_f32m8(x, vl); + _reciprocal = __riscv_vfmul_vv_f32m8(__riscv_vfrsub_vf_f32m8(__riscv_vfmul_vv_f32m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl); + // _reciprocal = __riscv_vfmul_vv_f32m8(__riscv_vfrsub_vf_f32m8(__riscv_vfmul_vv_f32m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl); #endif return _reciprocal; } @@ -255,7 +258,7 @@ struct unary_op_log10 { vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { - return vfmul_vf_f32m8(log_ps(x, vl), 0.434294481903, vl); + return __riscv_vfmul_vf_f32m8(log_ps(x, vl), 0.434294481903, vl); } }; @@ -263,7 +266,7 @@ struct unary_op_round { vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { - return vfcvt_f_x_v_f32m8(vfcvt_x_f_v_i32m8(x, vl), vl); + return __riscv_vfcvt_f_x_v_f32m8(__riscv_vfcvt_x_f_v_i32m8(x, vl), vl); } }; @@ -271,22 +274,22 @@ struct unary_op_trunc { vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { -#if C906 +#if __riscv_xtheadvector // simulate trunc with floor positives and ceil negative // xi = round(x) // floorx = xi - (xi > x) // ceilx = xi + (xi < x) // truncx = x >= 0 ? 
floorx : ceilx - vint32m8_t _xi = vfcvt_x_f_v_i32m8(x, vl); - vfloat32m8_t _xf = vfcvt_f_x_v_f32m8(_xi, vl); - vbool4_t _floormask = vmfgt_vv_f32m8_b4(_xf, x, vl); - vint32m8_t _floorx = vsub_vx_i32m8_m(_floormask, _xi, _xi, 1, vl); - vbool4_t _ceilmask = vmflt_vv_f32m8_b4(_xf, x, vl); - vint32m8_t _ceilx = vadd_vx_i32m8_m(_ceilmask, _xi, _xi, 1, vl); - vbool4_t _negative = vmflt_vf_f32m8_b4(x, 0.f, vl); - return vfcvt_f_x_v_f32m8(vmerge_vvm_i32m8(_negative, _floorx, _ceilx, vl), vl); + vint32m8_t _xi = __riscv_vfcvt_x_f_v_i32m8(x, vl); + vfloat32m8_t _xf = __riscv_vfcvt_f_x_v_f32m8(_xi, vl); + vbool4_t _floormask = __riscv_vmfgt_vv_f32m8_b4(_xf, x, vl); + vint32m8_t _floorx = __riscv_vsub_vx_i32m8_mu(_floormask, _xi, _xi, 1, vl); + vbool4_t _ceilmask = __riscv_vmflt_vv_f32m8_b4(_xf, x, vl); + vint32m8_t _ceilx = __riscv_vadd_vx_i32m8_mu(_ceilmask, _xi, _xi, 1, vl); + vbool4_t _negative = __riscv_vmflt_vf_f32m8_b4(x, 0.f, vl); + return __riscv_vfcvt_f_x_v_f32m8(__riscv_vmerge_vvm_i32m8(_floorx, _ceilx, _negative, vl), vl); #else - return vfcvt_f_x_v_f32m8(vfcvt_rtz_x_f_v_i32m8(x, vl), vl); + return __riscv_vfcvt_f_x_v_f32m8(__riscv_vfcvt_rtz_x_f_v_i32m8(x, vl), vl); #endif } }; @@ -296,9 +299,9 @@ struct unary_op_trunc int UnaryOp_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { +#if NCNN_ZFH int elembits = bottom_top_blob.elembits(); -#if __riscv_vector && __riscv_zfh if (opt.use_fp16_storage && elembits == 16) return forward_inplace_fp16s(bottom_top_blob, opt); #endif @@ -361,344 +364,26 @@ int UnaryOp_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons return unary_op_inplace(bottom_top_blob, opt); if (op_type == Operation_ROUND) - return unary_op_inplace(bottom_top_blob, opt); - - if (op_type == Operation_TRUNC) - return unary_op_inplace(bottom_top_blob, opt); - - return 0; -#else // __riscv_vector - return UnaryOp::forward_inplace(bottom_top_blob, opt); -#endif // __riscv_vector -} - -#if __riscv_vector && __riscv_zfh -template -static int unary_op_inplace_fp16s(Mat& a, const Option& opt) -{ - Op op; - - int w = a.w; - int h = a.h; - int d = a.d; - int channels = a.c; - int size = w * h * d; - int elempack = a.elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = a.channel(q); - - int n = size * elempack; - while (n > 0) - { - size_t vl = vsetvl_e16m8(n); - - vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - _p = op(_p, vl); - vse16_v_f16m8(ptr, _p, vl); - - ptr += vl; - n -= vl; - } - } - - return 0; -} - -namespace UnaryOp_riscv_functor { - -struct unary_op_abs_fp16s -{ - vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const - { - return vfsgnj_vf_f16m8(x, 1.f, vl); - } -}; - -struct unary_op_neg_fp16s -{ - vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const - { - return vfneg_v_f16m8(x, vl); - } -}; - -struct unary_op_floor_fp16s -{ - vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const - { - vint16m8_t _xi = vfcvt_x_f_v_i16m8(x, vl); - vbool2_t _mask = vmfgt_vv_f16m8_b2(vfcvt_f_x_v_f16m8(_xi, vl), x, vl); - return vfcvt_f_x_v_f16m8(vsub_vx_i16m8_m(_mask, _xi, _xi, 1, vl), vl); - } -}; - -struct unary_op_ceil_fp16s -{ - vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const - { - vint16m8_t _xi = vfcvt_x_f_v_i16m8(x, vl); - vbool2_t _mask = vmflt_vv_f16m8_b2(vfcvt_f_x_v_f16m8(_xi, vl), x, vl); - return vfcvt_f_x_v_f16m8(vadd_vx_i16m8_m(_mask, _xi, _xi, 1, vl), vl); - } -}; - -struct unary_op_square_fp16s -{ - 
vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const - { - return vfmul_vv_f16m8(x, x, vl); - } -}; - -struct unary_op_sqrt_fp16s -{ - vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const - { - return vfsqrt_v_f16m8(x, vl); - } -}; - -struct unary_op_rsqrt_fp16s -{ - vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const - { -#if C906 - vfloat16m8_t _reciprocal = vfrdiv_vf_f16m8(vfsqrt_v_f16m8(x, vl), 1.f, vl); -#else - vfloat16m8_t _reciprocal = vfrsqrt7_v_f16m8(x, vl); - _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(vfmul_vf_f16m8(x, 0.5f, vl), vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl); - // _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(vfmul_vf_f16m8(x, 0.5f, vl), vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl); -#endif - return _reciprocal; - } -}; - -struct unary_op_exp_fp16s -{ - vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const - { - return exp_ps(x, vl); - } -}; - -struct unary_op_log_fp16s -{ - vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const - { - return log_ps(x, vl); - } -}; - -struct unary_op_sin_fp16s -{ - vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const - { - return sin_ps(x, vl); - } -}; - -struct unary_op_cos_fp16s -{ - vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const - { - return cos_ps(x, vl); - } -}; - -struct unary_op_tan_fp16s -{ - vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const - { - // TODO rvv optimize - std::vector<__fp16> tmp(vl); - vse16_v_f16m8(tmp.data(), x, vl); - for (size_t i = 0; i < vl; i++) - { - tmp[i] = tanf((float)tmp[i]); - } - return vle16_v_f16m8(tmp.data(), vl); - } -}; - -struct unary_op_asin_fp16s -{ - vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const - { - // TODO rvv optimize - std::vector<__fp16> tmp(vl); - vse16_v_f16m8(tmp.data(), x, vl); - for (size_t i = 0; i < vl; i++) - { - tmp[i] = asinf((float)tmp[i]); - } - return vle16_v_f16m8(tmp.data(), vl); - } -}; - -struct unary_op_acos_fp16s -{ - vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const - { - // TODO rvv optimize - std::vector<__fp16> tmp(vl); - vse16_v_f16m8(tmp.data(), x, vl); - for (size_t i = 0; i < vl; i++) - { - tmp[i] = acosf((float)tmp[i]); - } - return vle16_v_f16m8(tmp.data(), vl); - } -}; - -struct unary_op_atan_fp16s -{ - vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const - { - // TODO rvv optimize - std::vector<__fp16> tmp(vl); - vse16_v_f16m8(tmp.data(), x, vl); - for (size_t i = 0; i < vl; i++) - { - tmp[i] = atanf((float)tmp[i]); - } - return vle16_v_f16m8(tmp.data(), vl); - } -}; - -struct unary_op_reciprocal_fp16s -{ - vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { -#if C906 - vfloat16m8_t _reciprocal = vfrdiv_vf_f16m8(x, 1.f, vl); -#else - vfloat16m8_t _reciprocal = vfrec7_v_f16m8(x, vl); - _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl); - // _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl); + // round to nearest even +#ifdef FE_TONEAREST + int old_rm = fegetround(); + fesetround(FE_TONEAREST); #endif - return _reciprocal; - } -}; - -struct unary_op_tanh_fp16s -{ - vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const - { - return tanh_ps(x, vl); - } -}; - -struct unary_op_log10_fp16s -{ - 
vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const - { - return vfmul_vf_f16m8(log_ps(x, vl), 0.434294481903, vl); - } -}; - -struct unary_op_round_fp16s -{ - vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const - { - return vfcvt_f_x_v_f16m8(vfcvt_x_f_v_i16m8(x, vl), vl); - } -}; - -struct unary_op_trunc_fp16s -{ - vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const - { -#if C906 - // simulate trunc with floor positives and ceil negative - // xi = round(x) - // floorx = xi - (xi > x) - // ceilx = xi + (xi < x) - // truncx = x >= 0 ? floorx : ceilx - vint16m8_t _xi = vfcvt_x_f_v_i16m8(x, vl); - vfloat16m8_t _xf = vfcvt_f_x_v_f16m8(_xi, vl); - vbool2_t _floormask = vmfgt_vv_f16m8_b2(_xf, x, vl); - vint16m8_t _floorx = vsub_vx_i16m8_m(_floormask, _xi, _xi, 1, vl); - vbool2_t _ceilmask = vmflt_vv_f16m8_b2(_xf, x, vl); - vint16m8_t _ceilx = vadd_vx_i16m8_m(_ceilmask, _xi, _xi, 1, vl); - vbool2_t _negative = vmflt_vf_f16m8_b2(x, 0.f, vl); - return vfcvt_f_x_v_f16m8(vmerge_vvm_i16m8(_negative, _floorx, _ceilx, vl), vl); -#else - return vfcvt_f_x_v_f16m8(vfcvt_rtz_x_f_v_i16m8(x, vl), vl); + int ret = unary_op_inplace(bottom_top_blob, opt); +#ifdef FE_TONEAREST + fesetround(old_rm); #endif + return ret; } -}; - -} // namespace UnaryOp_riscv_functor - -int UnaryOp_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const -{ - using namespace UnaryOp_riscv_functor; - - if (op_type == Operation_ABS) - return unary_op_inplace_fp16s(bottom_top_blob, opt); - - if (op_type == Operation_NEG) - return unary_op_inplace_fp16s(bottom_top_blob, opt); - - if (op_type == Operation_FLOOR) - return unary_op_inplace_fp16s(bottom_top_blob, opt); - - if (op_type == Operation_CEIL) - return unary_op_inplace_fp16s(bottom_top_blob, opt); - - if (op_type == Operation_SQUARE) - return unary_op_inplace_fp16s(bottom_top_blob, opt); - - if (op_type == Operation_SQRT) - return unary_op_inplace_fp16s(bottom_top_blob, opt); - - if (op_type == Operation_RSQRT) - return unary_op_inplace_fp16s(bottom_top_blob, opt); - - if (op_type == Operation_EXP) - return unary_op_inplace_fp16s(bottom_top_blob, opt); - - if (op_type == Operation_LOG) - return unary_op_inplace_fp16s(bottom_top_blob, opt); - - if (op_type == Operation_SIN) - return unary_op_inplace_fp16s(bottom_top_blob, opt); - - if (op_type == Operation_COS) - return unary_op_inplace_fp16s(bottom_top_blob, opt); - - if (op_type == Operation_TAN) - return unary_op_inplace_fp16s(bottom_top_blob, opt); - - if (op_type == Operation_ASIN) - return unary_op_inplace_fp16s(bottom_top_blob, opt); - - if (op_type == Operation_ACOS) - return unary_op_inplace_fp16s(bottom_top_blob, opt); - - if (op_type == Operation_ATAN) - return unary_op_inplace_fp16s(bottom_top_blob, opt); - - if (op_type == Operation_RECIPROCAL) - return unary_op_inplace_fp16s(bottom_top_blob, opt); - - if (op_type == Operation_TANH) - return unary_op_inplace_fp16s(bottom_top_blob, opt); - - if (op_type == Operation_LOG10) - return unary_op_inplace_fp16s(bottom_top_blob, opt); - - if (op_type == Operation_ROUND) - return unary_op_inplace_fp16s(bottom_top_blob, opt); if (op_type == Operation_TRUNC) - return unary_op_inplace_fp16s(bottom_top_blob, opt); + return unary_op_inplace(bottom_top_blob, opt); return 0; +#else // __riscv_vector + return UnaryOp::forward_inplace(bottom_top_blob, opt); +#endif // __riscv_vector } -#endif // __riscv_vector && __riscv_zfh } // namespace ncnn diff --git a/src/layer/riscv/unaryop_riscv.h 
b/src/layer/riscv/unaryop_riscv.h index 215ad3426a4..c3db29bb4aa 100644 --- a/src/layer/riscv/unaryop_riscv.h +++ b/src/layer/riscv/unaryop_riscv.h @@ -27,7 +27,7 @@ class UnaryOp_riscv : public UnaryOp virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; protected: -#if __riscv_vector && __riscv_zfh +#if NCNN_ZFH int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; #endif };
diff --git a/src/layer/riscv/unaryop_riscv_zfh.cpp b/src/layer/riscv/unaryop_riscv_zfh.cpp new file mode 100644 index 00000000000..07ec0d16e15 --- /dev/null +++ b/src/layer/riscv/unaryop_riscv_zfh.cpp @@ -0,0 +1,512 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "unaryop_riscv.h" + +#if __riscv_vector +#include <riscv_vector.h> +#include "rvv_mathfun.h" +#if __riscv_zvfh +#include "rvv_mathfun_fp16s.h" +#endif +#endif // __riscv_vector + +#include <fenv.h> + +namespace ncnn { + +#if NCNN_ZFH +template<typename Op> +static int unary_op_inplace_fp16s(Mat& a, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int size = w * h * d; + int elempack = a.elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = a.channel(q); + +#if __riscv_zvfh + int n = size * elempack; + while (n > 0) + { + size_t vl = __riscv_vsetvl_e16m8(n); + + vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); + _p = op(_p, vl); + __riscv_vse16_v_f16m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } +#else // __riscv_zvfh + for (int i = 0; i < size; i++) + { + ptr[i] = op(ptr[i]); + } +#endif // __riscv_zvfh + } + + return 0; +} + +namespace UnaryOp_riscv_functor { + +struct unary_op_abs_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { + return __riscv_vfsgnj_vf_f16m8(x, (__fp16)1.f, vl); + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)fabsf((float)x); + } +#endif // __riscv_zvfh +}; + +struct unary_op_neg_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { + return __riscv_vfneg_v_f16m8(x, vl); + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return -x; + } +#endif // __riscv_zvfh +}; + +struct unary_op_floor_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { + return __riscv_vfcvt_f_x_v_f16m8(__riscv_vfcvt_x_f_v_i16m8_rm(x, __RISCV_FRM_RDN, vl), vl); + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)floorf((float)x); + } +#endif // __riscv_zvfh +}; + +struct unary_op_ceil_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { + return __riscv_vfcvt_f_x_v_f16m8(__riscv_vfcvt_x_f_v_i16m8_rm(x, __RISCV_FRM_RUP, vl), vl); + } +#else // __riscv_zvfh
+ __fp16 operator()(const __fp16& x) const + { + return (__fp16)ceilf((float)x); + } +#endif // __riscv_zvfh +}; + +struct unary_op_square_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { + return __riscv_vfmul_vv_f16m8(x, x, vl); + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return x * x; + } +#endif // __riscv_zvfh +}; + +struct unary_op_sqrt_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { + return __riscv_vfsqrt_v_f16m8(x, vl); + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)sqrtf((float)x); + } +#endif // __riscv_zvfh +}; + +struct unary_op_rsqrt_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { +#if __riscv_xtheadvector + vfloat16m8_t _reciprocal = __riscv_vfrdiv_vf_f16m8(__riscv_vfsqrt_v_f16m8(x, vl), (__fp16)1.f, vl); +#else + vfloat16m8_t _reciprocal = __riscv_vfrsqrt7_v_f16m8(x, vl); + _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(__riscv_vfmul_vf_f16m8(x, (__fp16)0.5f, vl), __riscv_vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), (__fp16)1.5f, vl), _reciprocal, vl); + // _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(__riscv_vfmul_vf_f16m8(x, (__fp16)0.5f, vl), __riscv_vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), (__fp16)1.5f, vl), _reciprocal, vl); +#endif + return _reciprocal; + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)(1.f / sqrtf((float)x)); + } +#endif // __riscv_zvfh +}; + +struct unary_op_exp_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { + return exp_ps(x, vl); + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)expf((float)x); + } +#endif // __riscv_zvfh +}; + +struct unary_op_log_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { + return log_ps(x, vl); + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)logf((float)x); + } +#endif // __riscv_zvfh +}; + +struct unary_op_sin_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { + return sin_ps(x, vl); + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)sinf((float)x); + } +#endif // __riscv_zvfh +}; + +struct unary_op_cos_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { + return cos_ps(x, vl); + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)cosf((float)x); + } +#endif // __riscv_zvfh +}; + +struct unary_op_tan_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { + // TODO rvv optimize + std::vector<__fp16> tmp(vl); + __riscv_vse16_v_f16m8(tmp.data(), x, vl); + for (size_t i = 0; i < vl; i++) + { + tmp[i] = (__fp16)tanf((float)tmp[i]); + } + return __riscv_vle16_v_f16m8(tmp.data(), vl); + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)tanf((float)x); + } +#endif // __riscv_zvfh +}; + +struct unary_op_asin_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { + // TODO rvv optimize + std::vector<__fp16> tmp(vl); + __riscv_vse16_v_f16m8(tmp.data(), x, 
vl); + for (size_t i = 0; i < vl; i++) + { + tmp[i] = (__fp16)asinf((float)tmp[i]); + } + return __riscv_vle16_v_f16m8(tmp.data(), vl); + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)asin((float)x); + } +#endif // __riscv_zvfh +}; + +struct unary_op_acos_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { + // TODO rvv optimize + std::vector<__fp16> tmp(vl); + __riscv_vse16_v_f16m8(tmp.data(), x, vl); + for (size_t i = 0; i < vl; i++) + { + tmp[i] = (__fp16)acosf((float)tmp[i]); + } + return __riscv_vle16_v_f16m8(tmp.data(), vl); + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)acos((float)x); + } +#endif // __riscv_zvfh +}; + +struct unary_op_atan_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { + // TODO rvv optimize + std::vector<__fp16> tmp(vl); + __riscv_vse16_v_f16m8(tmp.data(), x, vl); + for (size_t i = 0; i < vl; i++) + { + tmp[i] = (__fp16)atanf((float)tmp[i]); + } + return __riscv_vle16_v_f16m8(tmp.data(), vl); + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)atan((float)x); + } +#endif // __riscv_zvfh +}; + +struct unary_op_reciprocal_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { +#if __riscv_xtheadvector + vfloat16m8_t _reciprocal = __riscv_vfrdiv_vf_f16m8(x, (__fp16)1.f, vl); +#else + vfloat16m8_t _reciprocal = __riscv_vfrec7_v_f16m8(x, vl); + _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(x, _reciprocal, vl), (__fp16)2.f, vl), _reciprocal, vl); + // _reciprocal = __riscv_vfmul_vv_f16m8(__riscv_vfrsub_vf_f16m8(__riscv_vfmul_vv_f16m8(x, _reciprocal, vl), (__fp16)2.f, vl), _reciprocal, vl); +#endif + return _reciprocal; + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)1.f / x; + } +#endif // __riscv_zvfh +}; + +struct unary_op_tanh_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { + return tanh_ps(x, vl); + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)tanhf((float)x); + } +#endif // __riscv_zvfh +}; + +struct unary_op_log10_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { + return __riscv_vfmul_vf_f16m8(log_ps(x, vl), (__fp16)0.434294481903, vl); + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)log10f((float)x); + } +#endif // __riscv_zvfh +}; + +struct unary_op_round_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { + return __riscv_vfcvt_f_x_v_f16m8(__riscv_vfcvt_x_f_v_i16m8(x, vl), vl); + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)nearbyintf((float)x); + } +#endif // __riscv_zvfh +}; + +struct unary_op_trunc_fp16s +{ +#if __riscv_zvfh + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const + { +#if __riscv_xtheadvector + // simulate trunc with floor positives and ceil negative + // xi = round(x) + // floorx = xi - (xi > x) + // ceilx = xi + (xi < x) + // truncx = x >= 0 ? 
floorx : ceilx + vint16m8_t _xi = __riscv_vfcvt_x_f_v_i16m8(x, vl); + vfloat16m8_t _xf = __riscv_vfcvt_f_x_v_f16m8(_xi, vl); + vbool2_t _floormask = __riscv_vmfgt_vv_f16m8_b2(_xf, x, vl); + vint16m8_t _floorx = __riscv_vsub_vx_i16m8_mu(_floormask, _xi, _xi, 1, vl); + vbool2_t _ceilmask = __riscv_vmflt_vv_f16m8_b2(_xf, x, vl); + vint16m8_t _ceilx = __riscv_vadd_vx_i16m8_mu(_ceilmask, _xi, _xi, 1, vl); + vbool2_t _negative = __riscv_vmflt_vf_f16m8_b2(x, (__fp16)0.f, vl); + return __riscv_vfcvt_f_x_v_f16m8(__riscv_vmerge_vvm_i16m8(_floorx, _ceilx, _negative, vl), vl); +#else + return __riscv_vfcvt_f_x_v_f16m8(__riscv_vfcvt_rtz_x_f_v_i16m8(x, vl), vl); +#endif + } +#else // __riscv_zvfh + __fp16 operator()(const __fp16& x) const + { + return (__fp16)truncf((float)x); + } +#endif // __riscv_zvfh +}; + +} // namespace UnaryOp_riscv_functor + +int UnaryOp_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + using namespace UnaryOp_riscv_functor; + + if (op_type == Operation_ABS) + return unary_op_inplace_fp16s<unary_op_abs_fp16s>(bottom_top_blob, opt); + + if (op_type == Operation_NEG) + return unary_op_inplace_fp16s<unary_op_neg_fp16s>(bottom_top_blob, opt); + + if (op_type == Operation_FLOOR) + return unary_op_inplace_fp16s<unary_op_floor_fp16s>(bottom_top_blob, opt); + + if (op_type == Operation_CEIL) + return unary_op_inplace_fp16s<unary_op_ceil_fp16s>(bottom_top_blob, opt); + + if (op_type == Operation_SQUARE) + return unary_op_inplace_fp16s<unary_op_square_fp16s>(bottom_top_blob, opt); + + if (op_type == Operation_SQRT) + return unary_op_inplace_fp16s<unary_op_sqrt_fp16s>(bottom_top_blob, opt); + + if (op_type == Operation_RSQRT) + return unary_op_inplace_fp16s<unary_op_rsqrt_fp16s>(bottom_top_blob, opt); + + if (op_type == Operation_EXP) + return unary_op_inplace_fp16s<unary_op_exp_fp16s>(bottom_top_blob, opt); + + if (op_type == Operation_LOG) + return unary_op_inplace_fp16s<unary_op_log_fp16s>(bottom_top_blob, opt); + + if (op_type == Operation_SIN) + return unary_op_inplace_fp16s<unary_op_sin_fp16s>(bottom_top_blob, opt); + + if (op_type == Operation_COS) + return unary_op_inplace_fp16s<unary_op_cos_fp16s>(bottom_top_blob, opt); + + if (op_type == Operation_TAN) + return unary_op_inplace_fp16s<unary_op_tan_fp16s>(bottom_top_blob, opt); + + if (op_type == Operation_ASIN) + return unary_op_inplace_fp16s<unary_op_asin_fp16s>(bottom_top_blob, opt); + + if (op_type == Operation_ACOS) + return unary_op_inplace_fp16s<unary_op_acos_fp16s>(bottom_top_blob, opt); + + if (op_type == Operation_ATAN) + return unary_op_inplace_fp16s<unary_op_atan_fp16s>(bottom_top_blob, opt); + + if (op_type == Operation_RECIPROCAL) + return unary_op_inplace_fp16s<unary_op_reciprocal_fp16s>(bottom_top_blob, opt); + + if (op_type == Operation_TANH) + return unary_op_inplace_fp16s<unary_op_tanh_fp16s>(bottom_top_blob, opt); + + if (op_type == Operation_LOG10) + return unary_op_inplace_fp16s<unary_op_log10_fp16s>(bottom_top_blob, opt); + + if (op_type == Operation_ROUND) + { + // round to nearest even +#ifdef FE_TONEAREST + int old_rm = fegetround(); + fesetround(FE_TONEAREST); +#endif + int ret = unary_op_inplace_fp16s<unary_op_round_fp16s>(bottom_top_blob, opt); +#ifdef FE_TONEAREST + fesetround(old_rm); +#endif + return ret; + } + + if (op_type == Operation_TRUNC) + return unary_op_inplace_fp16s<unary_op_trunc_fp16s>(bottom_top_blob, opt); + + return 0; +} +#endif // NCNN_ZFH + +} // namespace ncnn
diff --git a/src/layer/split.cpp b/src/layer/split.cpp index 996624dfe7a..b5b24f8b3ad 100644 --- a/src/layer/split.cpp +++ b/src/layer/split.cpp @@ -22,7 +22,7 @@ Split::Split() one_blob_only = false; support_inplace = false; support_packing = true; - support_fp16_storage = cpu_support_arm_asimdhp() || cpu_support_riscv_zfh(); + support_fp16_storage = cpu_support_arm_asimdhp() || cpu_support_riscv_zvfh(); support_bf16_storage = true; }
diff --git a/src/layer/unaryop.cpp b/src/layer/unaryop.cpp index
b05add15cfb..28447f171e7 100644 --- a/src/layer/unaryop.cpp +++ b/src/layer/unaryop.cpp @@ -196,16 +196,7 @@ struct unary_op_round { float operator()(const float& x) const { - // round to nearest even -#ifdef FE_TONEAREST - int old_rm = fegetround(); - fesetround(FE_TONEAREST); -#endif - float y = nearbyintf(x); -#ifdef FE_TONEAREST - fesetround(old_rm); -#endif - return y; + return nearbyintf(x); } }; @@ -274,7 +265,18 @@ int UnaryOp::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return unary_op_inplace(bottom_top_blob, opt); if (op_type == Operation_ROUND) - return unary_op_inplace(bottom_top_blob, opt); + { + // round to nearest even +#ifdef FE_TONEAREST + int old_rm = fegetround(); + fesetround(FE_TONEAREST); +#endif + int ret = unary_op_inplace(bottom_top_blob, opt); +#ifdef FE_TONEAREST + fesetround(old_rm); +#endif + return ret; + } if (op_type == Operation_TRUNC) return unary_op_inplace(bottom_top_blob, opt); diff --git a/src/layer/x86/unaryop_x86.cpp b/src/layer/x86/unaryop_x86.cpp index e634328d4cb..ca1e26bdc8f 100644 --- a/src/layer/x86/unaryop_x86.cpp +++ b/src/layer/x86/unaryop_x86.cpp @@ -575,17 +575,8 @@ struct unary_op_round { NCNN_FORCEINLINE float func(const float& x) const { - // round to nearest even // return (x + 12582912.f) - 12582912.f; -#ifdef FE_TONEAREST - int old_rm = fegetround(); - fesetround(FE_TONEAREST); -#endif - float y = nearbyintf(x); -#ifdef FE_TONEAREST - fesetround(old_rm); -#endif - return y; + return nearbyintf(x); } #if __SSE2__ NCNN_FORCEINLINE __m128 func_pack4(const __m128& x) const @@ -701,7 +692,18 @@ int UnaryOp_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const return unary_op_inplace(bottom_top_blob, opt); if (op_type == Operation_ROUND) - return unary_op_inplace(bottom_top_blob, opt); + { + // round to nearest even +#ifdef FE_TONEAREST + int old_rm = fegetround(); + fesetround(FE_TONEAREST); +#endif + int ret = unary_op_inplace(bottom_top_blob, opt); +#ifdef FE_TONEAREST + fesetround(old_rm); +#endif + return ret; + } if (op_type == Operation_TRUNC) return unary_op_inplace(bottom_top_blob, opt); diff --git a/src/layer_registry.h.in b/src/layer_registry.h.in index dfe8e73ce79..6f34f8cb0f0 100644 --- a/src/layer_registry.h.in +++ b/src/layer_registry.h.in @@ -52,6 +52,12 @@ static const layer_registry_entry layer_registry_rvv[] = { }; #endif // NCNN_RUNTIME_CPU && NCNN_RVV +#if NCNN_RUNTIME_CPU && NCNN_XTHEADVECTOR +static const layer_registry_entry layer_registry_xtheadvector[] = { +@layer_registry_xtheadvector@ +}; +#endif // NCNN_RUNTIME_CPU && NCNN_XTHEADVECTOR + #if NCNN_VULKAN static const layer_registry_entry layer_registry_vulkan[] = { @layer_registry_vulkan@ diff --git a/src/mat.h b/src/mat.h index fdf5cc597c4..1cbce635bbd 100644 --- a/src/mat.h +++ b/src/mat.h @@ -138,9 +138,9 @@ class NCNN_EXPORT Mat void fill(vfloat32m1_t _v); void fill(vuint16m1_t _v); void fill(vint8m1_t _v); -#if __riscv_zfh +#if __riscv_zvfh void fill(vfloat16m1_t _v); -#endif // __riscv_zfh +#endif // __riscv_zvfh #endif // __riscv_vector template void fill(T v); @@ -1089,17 +1089,18 @@ NCNN_FORCEINLINE void Mat::fill(__m128 _v) } } #endif // __loongarch_sx + #if __riscv_vector NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v) { const int packn = cpu_riscv_vlenb() / 4; - const size_t vl = vsetvl_e32m1(packn); + const size_t vl = __riscv_vsetvl_e32m1(packn); int size = (int)total(); float* ptr = (float*)data; for (int i = 0; i < size; i++) { - vse32_v_f32m1(ptr, _v, vl); + __riscv_vse32_v_f32m1(ptr, _v, vl); ptr += packn; } } 
@@ -1107,13 +1108,13 @@ NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v) NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v) { const int packn = cpu_riscv_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int size = (int)total(); unsigned short* ptr = (unsigned short*)data; for (int i = 0; i < size; i++) { - vse16_v_u16m1(ptr, _v, vl); + __riscv_vse16_v_u16m1(ptr, _v, vl); ptr += packn; } } @@ -1121,31 +1122,31 @@ NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v) NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v) { const int packn = cpu_riscv_vlenb() / 1; - const size_t vl = vsetvl_e8m1(packn); + const size_t vl = __riscv_vsetvl_e8m1(packn); int size = (int)total(); signed char* ptr = (signed char*)data; for (int i = 0; i < size; i++) { - vse8_v_i8m1(ptr, _v, vl); + __riscv_vse8_v_i8m1(ptr, _v, vl); ptr += packn; } } -#if __riscv_zfh +#if __riscv_zvfh NCNN_FORCEINLINE void Mat::fill(vfloat16m1_t _v) { const int packn = cpu_riscv_vlenb() / 2; - const size_t vl = vsetvl_e16m1(packn); + const size_t vl = __riscv_vsetvl_e16m1(packn); int size = (int)total(); __fp16* ptr = (__fp16*)data; for (int i = 0; i < size; i++) { - vse16_v_f16m1(ptr, _v, vl); + __riscv_vse16_v_f16m1(ptr, _v, vl); ptr += packn; } } -#endif // __riscv_zfh +#endif // __riscv_zvfh #endif // __riscv_vector template diff --git a/src/net.cpp b/src/net.cpp index 904e14cb2f7..40fea36a03c 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -639,15 +639,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio } else #endif // NCNN_VFPV4 -#if NCNN_RVV - if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && layer->support_fp16_storage) +#if NCNN_ZFH + if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && layer->support_fp16_storage) { Mat bottom_blob_fp16; cast_float32_to_float16(bottom_blob, bottom_blob_fp16, opt); bottom_blob = bottom_blob_fp16; } else -#endif // NCNN_RVV +#endif // NCNN_ZFH #if NCNN_BF16 if (opt.use_bf16_storage && layer->support_bf16_storage) { @@ -695,7 +695,7 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio dst_elempack = 8; else if (elemcount % 4 == 0) dst_elempack = 4; -#elif NCNN_RVV +#elif NCNN_RVV || NCNN_XTHEADVECTOR const int packn = ncnn::cpu_riscv_vlenb() / 4; if (elemcount % packn == 0) dst_elempack = packn; @@ -711,7 +711,7 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio dst_elempack = 8; else if (elemcount % 4 == 0) dst_elempack = 4; -#elif NCNN_RVV +#elif NCNN_RVV || NCNN_XTHEADVECTOR const int packn = ncnn::cpu_riscv_vlenb() / 2; if (elemcount % packn == 0) dst_elempack = packn; @@ -722,7 +722,7 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio } if (elembits == 8) { -#if NCNN_RVV +#if NCNN_RVV || NCNN_XTHEADVECTOR const int packn = ncnn::cpu_riscv_vlenb() / 1; if (elemcount % packn == 0) dst_elempack = packn; @@ -767,15 +767,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio } else #endif // NCNN_VFPV4 -#if NCNN_RVV - if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && !layer->support_fp16_storage) +#if NCNN_ZFH + if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && !layer->support_fp16_storage) { Mat bottom_blob_fp32; cast_float16_to_float32(bottom_blob, bottom_blob_fp32, opt); 
bottom_blob = bottom_blob_fp32; } else -#endif // NCNN_RVV +#endif // NCNN_ZFH #if NCNN_BF16 if (opt.use_bf16_storage && !layer->support_bf16_storage) { @@ -2761,8 +2761,8 @@ int Extractor::extract(int blob_index, Mat& feat, int type) } else #endif // NCNN_VFPV4 -#if NCNN_RVV - if (d->opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && (type == 0)) +#if NCNN_ZVFH + if (d->opt.use_fp16_storage && cpu_support_riscv_zvfh() && (type == 0)) { if (feat.elembits() == 16) { @@ -2772,7 +2772,7 @@ int Extractor::extract(int blob_index, Mat& feat, int type) } } else -#endif // NCNN_RVV +#endif // NCNN_ZVFH #if NCNN_BF16 if (d->opt.use_bf16_storage && (type == 0)) { diff --git a/src/platform.h.in b/src/platform.h.in index a0b372d8296..023d5d11023 100644 --- a/src/platform.h.in +++ b/src/platform.h.in @@ -62,6 +62,9 @@ #cmakedefine01 NCNN_LSX #cmakedefine01 NCNN_MMI #cmakedefine01 NCNN_RVV +#cmakedefine01 NCNN_ZFH +#cmakedefine01 NCNN_ZVFH +#cmakedefine01 NCNN_XTHEADVECTOR #cmakedefine01 NCNN_INT8 #cmakedefine01 NCNN_BF16 #cmakedefine01 NCNN_FORCE_INLINE diff --git a/tests/testutil.cpp b/tests/testutil.cpp index ffc12bccfa3..3cf3e605e3b 100644 --- a/tests/testutil.cpp +++ b/tests/testutil.cpp @@ -342,13 +342,13 @@ static int convert_to_optimal_layout(const ncnn::Mat& a, ncnn::Mat& a4, const nc } else #endif // NCNN_VFPV4 -#if NCNN_RVV - if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) +#if NCNN_ZFH + if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) { ncnn::cast_float32_to_float16(a, a4, opt); } else -#endif // NCNN_RVV +#endif // NCNN_ZFH #if NCNN_BF16 if (opt.use_bf16_storage && op->support_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) { @@ -394,8 +394,8 @@ static int convert_to_optimal_layout(const ncnn::Mat& a, ncnn::Mat& a4, const nc dst_elempack = 8; else if (elemcount % 4 == 0) dst_elempack = 4; -#elif NCNN_RVV - const int packn = ncnn::cpu_riscv_vlenb() / (elembits / 8); +#elif NCNN_RVV || NCNN_XTHEADVECTOR + const int packn = ncnn::cpu_riscv_vlenb() / 4; if (elemcount % packn == 0) dst_elempack = packn; #else @@ -410,7 +410,7 @@ static int convert_to_optimal_layout(const ncnn::Mat& a, ncnn::Mat& a4, const nc dst_elempack = 8; else if (elemcount % 4 == 0) dst_elempack = 4; -#elif NCNN_RVV +#elif NCNN_RVV || NCNN_XTHEADVECTOR const int packn = ncnn::cpu_riscv_vlenb() / 2; if (elemcount % packn == 0) dst_elempack = packn; @@ -421,7 +421,7 @@ static int convert_to_optimal_layout(const ncnn::Mat& a, ncnn::Mat& a4, const nc } if (elembits == 8) { -#if NCNN_RVV +#if NCNN_RVV || NCNN_XTHEADVECTOR const int packn = ncnn::cpu_riscv_vlenb() / 1; if (elemcount % packn == 0) dst_elempack = packn; @@ -470,13 +470,13 @@ static int convert_to_vanilla_layout(const ncnn::Mat& c4, ncnn::Mat& c, const nc } else #endif // NCNN_VFPV4 -#if NCNN_RVV - if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && c4_unpacked.elembits() == 16) +#if NCNN_ZFH + if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && op->support_fp16_storage && c4_unpacked.elembits() == 16) { ncnn::cast_float16_to_float32(c4_unpacked, c, opt); } else -#endif // NCNN_RVV +#endif // NCNN_ZFH #if 
NCNN_BF16 if (opt.use_bf16_storage && op->support_bf16_storage && c4_unpacked.elembits() == 16) { diff --git a/toolchains/c906-v222.toolchain.cmake b/toolchains/c906-v222.toolchain.cmake deleted file mode 100644 index 02c065e5964..00000000000 --- a/toolchains/c906-v222.toolchain.cmake +++ /dev/null @@ -1,41 +0,0 @@ -set(CMAKE_SYSTEM_NAME Linux) -set(CMAKE_SYSTEM_PROCESSOR riscv64) -set(C906 True) - -if(DEFINED ENV{RISCV_ROOT_PATH}) - file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH) -else() - message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined") -endif() - -set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain") - -set(CMAKE_C_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc") -set(CMAKE_CXX_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++") - -set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu") - -set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot") - -if(NOT CMAKE_FIND_ROOT_PATH_MODE_PROGRAM) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_LIBRARY) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_INCLUDE) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) -endif() - -set(CMAKE_C_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static") -set(CMAKE_CXX_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static") - -# cache flags -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags") - -# export RISCV_ROOT_PATH=/home/nihui/osd/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.2.2 -# cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/c906-v222.toolchain.cmake -DCMAKE_BUILD_TYPE=release -DNCNN_BUILD_TESTS=ON -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON .. 
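The Xuantie toolchain files deleted here pinned the pre-ratification v0p7 vector ISA through -march=rv64gcv0p7_zfh_xtheadc and -DC906=1; the kernels in this patch instead branch on the compiler macro __riscv_xtheadvector and use the v1.0 (__riscv_-prefixed) intrinsic spelling everywhere. As a rough standalone sketch of that pattern — not part of the patch; the function name and buffer layout are illustrative only — the reciprocal-square-root refinement rewritten in unaryop_riscv.cpp above looks like this:

// Illustrative sketch (assumption: standalone file, not ncnn source): strip-mined
// rsqrt over a float buffer, mirroring the unary_op_rsqrt functor above.
#include <riscv_vector.h>
#include <stddef.h>

static void rsqrt_inplace(float* ptr, size_t n)
{
    while (n > 0)
    {
        size_t vl = __riscv_vsetvl_e32m8(n);
        vfloat32m8_t _x = __riscv_vle32_v_f32m8(ptr, vl);
#if __riscv_xtheadvector
        // T-Head 0.7-era vector has no vfrsqrt7, so fall back to a full divide
        vfloat32m8_t _r = __riscv_vfrdiv_vf_f32m8(__riscv_vfsqrt_v_f32m8(_x, vl), 1.f, vl);
#else
        // RVV 1.0: 7-bit estimate followed by one Newton-Raphson step, r = r * (1.5 - 0.5*x*r*r)
        vfloat32m8_t _r = __riscv_vfrsqrt7_v_f32m8(_x, vl);
        vfloat32m8_t _half_x_rr = __riscv_vfmul_vv_f32m8(__riscv_vfmul_vf_f32m8(_x, 0.5f, vl), __riscv_vfmul_vv_f32m8(_r, _r, vl), vl);
        _r = __riscv_vfmul_vv_f32m8(__riscv_vfrsub_vf_f32m8(_half_x_rr, 1.5f, vl), _r, vl);
#endif
        __riscv_vse32_v_f32m8(ptr, _r, vl);
        ptr += vl;
        n -= vl;
    }
}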
diff --git a/toolchains/c906-v223.toolchain.cmake b/toolchains/c906-v223.toolchain.cmake deleted file mode 100644 index d20d2d52398..00000000000 --- a/toolchains/c906-v223.toolchain.cmake +++ /dev/null @@ -1,41 +0,0 @@ -set(CMAKE_SYSTEM_NAME Linux) -set(CMAKE_SYSTEM_PROCESSOR riscv64) -set(C906 True) - -if(DEFINED ENV{RISCV_ROOT_PATH}) - file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH) -else() - message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined") -endif() - -set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain") - -set(CMAKE_C_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc") -set(CMAKE_CXX_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++") - -set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu") - -set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot") - -if(NOT CMAKE_FIND_ROOT_PATH_MODE_PROGRAM) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_LIBRARY) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_INCLUDE) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) -endif() - -set(CMAKE_C_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static") -set(CMAKE_CXX_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static") - -# cache flags -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags") - -# export RISCV_ROOT_PATH=/home/nihui/osd/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.2.3 -# cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/c906-v223.toolchain.cmake -DCMAKE_BUILD_TYPE=release -DNCNN_BUILD_TESTS=ON -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON .. diff --git a/toolchains/c906-v226.toolchain.cmake b/toolchains/c906-v301.toolchain.cmake similarity index 72% rename from toolchains/c906-v226.toolchain.cmake rename to toolchains/c906-v301.toolchain.cmake index aab3dd60ee6..3ef3c0f655e 100644 --- a/toolchains/c906-v226.toolchain.cmake +++ b/toolchains/c906-v301.toolchain.cmake @@ -30,12 +30,12 @@ if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE) set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) endif() -set(CMAKE_C_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static") -set(CMAKE_CXX_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static") +set(CMAKE_C_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -D__riscv_zvfh=1 -static") +set(CMAKE_CXX_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -D__riscv_zvfh=1 -static") # cache flags set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags") -# export RISCV_ROOT_PATH=/home/nihui/osd/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.2.6 -# cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/c906-v226.toolchain.cmake -DCMAKE_BUILD_TYPE=release -DNCNN_BUILD_TESTS=ON -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON .. 
+# export RISCV_ROOT_PATH=/home/nihui/osd/Xuantie-900-gcc-linux-6.6.0-glibc-x86_64-V3.0.1 +# cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/c906-v301.toolchain.cmake -DCMAKE_BUILD_TYPE=release -DNCNN_BUILD_TESTS=ON -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=OFF -DNCNN_XTHEADVECTOR=ON -DNCNN_ZFH=ON -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON .. diff --git a/toolchains/c910-v240.toolchain.cmake b/toolchains/c910-v240.toolchain.cmake deleted file mode 100644 index 3b63f6b388b..00000000000 --- a/toolchains/c910-v240.toolchain.cmake +++ /dev/null @@ -1,41 +0,0 @@ -set(CMAKE_SYSTEM_NAME Linux) -set(CMAKE_SYSTEM_PROCESSOR riscv64) -set(C906 True) - -if(DEFINED ENV{RISCV_ROOT_PATH}) - file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH) -else() - message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined") -endif() - -set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain") - -set(CMAKE_C_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc") -set(CMAKE_CXX_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++") - -set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu") - -set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot") - -if(NOT CMAKE_FIND_ROOT_PATH_MODE_PROGRAM) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_LIBRARY) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_INCLUDE) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) -endif() - -set(CMAKE_C_FLAGS "-march=rv64gcxtheadc -mabi=lp64d -mtune=c910 -DC906=1 -static") -set(CMAKE_CXX_FLAGS "-march=rv64gcxtheadc -mabi=lp64d -mtune=c910 -DC906=1 -static") - -# cache flags -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags") - -# export RISCV_ROOT_PATH=/home/nihui/osd/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.4.0 -# cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/c910-v240.toolchain.cmake -DCMAKE_BUILD_TYPE=release -DNCNN_BUILD_TESTS=ON -DNCNN_OPENMP=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON .. 
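The Xuantie toolchains kept by this patch (c906-v301 above, c910-v301 below) pass -D__riscv_zvfh=1 alongside _zfh_xtheadvector because fp16 storage support is now decided per layer at construction time rather than from __riscv_zfh alone. A condensed sketch of that probe, as it appears in the TanH_riscv and UnaryOp_riscv constructors earlier in this patch (the free-function wrapper is illustrative only):

// Condensed from the layer constructors above: vector fp16 kernels require Zvfh
// (or the xtheadvector equivalent), the scalar fallback only Zfh.
#include "cpu.h"

static bool riscv_fp16_storage_supported()
{
#if NCNN_ZFH
#if __riscv_vector
    return ncnn::cpu_support_riscv_zvfh();
#else
    return ncnn::cpu_support_riscv_zfh();
#endif
#else
    return false;
#endif
}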
diff --git a/toolchains/c906-v240.toolchain.cmake b/toolchains/c910-v301.toolchain.cmake similarity index 70% rename from toolchains/c906-v240.toolchain.cmake rename to toolchains/c910-v301.toolchain.cmake index e2a402d1426..503d0c73f50 100644 --- a/toolchains/c906-v240.toolchain.cmake +++ b/toolchains/c910-v301.toolchain.cmake @@ -30,12 +30,12 @@ if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE) set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) endif() -set(CMAKE_C_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static") -set(CMAKE_CXX_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static") +set(CMAKE_C_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -D__riscv_zvfh=1 -static") +set(CMAKE_CXX_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -D__riscv_zvfh=1 -static") # cache flags set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags") -# export RISCV_ROOT_PATH=/home/nihui/osd/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.4.0 -# cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/c906-v240.toolchain.cmake -DCMAKE_BUILD_TYPE=release -DNCNN_BUILD_TESTS=ON -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON .. +# export RISCV_ROOT_PATH=/home/nihui/osd/Xuantie-900-gcc-linux-6.6.0-glibc-x86_64-V3.0.1 +# cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/c910-v301.toolchain.cmake -DCMAKE_BUILD_TYPE=release -DNCNN_BUILD_TESTS=ON -DNCNN_OPENMP=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=OFF -DNCNN_XTHEADVECTOR=ON -DNCNN_ZFH=ON -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON .. diff --git a/toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake b/toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake index e9c6ad29e2a..c138edc76c1 100644 --- a/toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake +++ b/toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake @@ -9,8 +9,8 @@ endif() set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv gnu toolchain") -set(CMAKE_C_COMPILER "clang") -set(CMAKE_CXX_COMPILER "clang++") +set(CMAKE_C_COMPILER "${RISCV_ROOT_PATH}/bin/clang") +set(CMAKE_CXX_COMPILER "${RISCV_ROOT_PATH}/bin/clang++") set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot") set(CMAKE_C_COMPILER_TARGET "riscv64-unknown-linux-gnu") diff --git a/toolchains/th1520-v240.toolchain.cmake b/toolchains/th1520-v240.toolchain.cmake deleted file mode 100644 index 117d6a55c46..00000000000 --- a/toolchains/th1520-v240.toolchain.cmake +++ /dev/null @@ -1,39 +0,0 @@ -set(CMAKE_SYSTEM_NAME Linux) -set(CMAKE_SYSTEM_PROCESSOR riscv64) -set(C906 True) - -if(DEFINED ENV{RISCV_ROOT_PATH}) - file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH) -else() - message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined") -endif() - -set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain") - -set(CMAKE_C_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc") -set(CMAKE_CXX_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++") - -set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu") - -set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot") - -if(NOT CMAKE_FIND_ROOT_PATH_MODE_PROGRAM) - set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_LIBRARY) - set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -endif() -if(NOT CMAKE_FIND_ROOT_PATH_MODE_INCLUDE) - set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) -endif() -if(NOT 
CMAKE_FIND_ROOT_PATH_MODE_PACKAGE) - set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) -endif() - -set(CMAKE_C_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -static") -set(CMAKE_CXX_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -static") - -# cache flags -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags") -