diff --git a/configure.ac b/configure.ac index 4578438607..7168e172ea 100644 --- a/configure.ac +++ b/configure.ac @@ -3077,10 +3077,14 @@ do AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_CARRYLESS" ;; zkn|zkned) - # AES encrypt/decrpyt + # AES encrypt/decrypt, SHA-2 ENABLED_RISCV_ASM=yes AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_SCALAR_CRYPTO_ASM" ;; + zv) + ENABLED_RISCV_ASM=yes + AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_VECTOR" + ;; zvkg) # VGMUL, VHHSH ENABLED_RISCV_ASM=yes @@ -3097,12 +3101,12 @@ do AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION" ;; zvkned) - # Vector AES + # Vector AES, SHA-2 ENABLED_RISCV_ASM=yes AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_RISCV_VECTOR_CRYPTO_ASM" ;; *) - AC_MSG_ERROR([Invalid RISC-V option [yes,zbkb,zbb,zbc,zbkc,zkn,zkned,zvkg,zvbc,zvbb,zvkb,zvkned]: $ENABLED_RISCV_ASM.]) + AC_MSG_ERROR([Invalid RISC-V option [yes,zbkb,zbb,zbc,zbkc,zkn,zkned,zv,zvkg,zvbc,zvbb,zvkb,zvkned]: $ENABLED_RISCV_ASM.]) break ;; esac diff --git a/src/include.am b/src/include.am index 056f7ef559..4d96fd2ebe 100644 --- a/src/include.am +++ b/src/include.am @@ -971,17 +971,21 @@ if BUILD_CHACHA if BUILD_ARMASM_NEON src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/armv8-chacha.c else +if BUILD_RISCV_ASM +src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/riscv/riscv-64-chacha.c +else src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/chacha.c +endif !BUILD_RISCV_ASM if !BUILD_X86_ASM if BUILD_INTELASM src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/chacha_asm.S -endif -endif -endif +endif BUILD_INTELASM +endif !BUILD_X86_ASM +endif !BUILD_ARMASM_NEON if BUILD_POLY1305 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/chacha20_poly1305.c -endif -endif +endif BUILD_POLY1305 +endif BUILD_CHACHA if !BUILD_INLINE src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/misc.c diff --git a/wolfcrypt/src/chacha.c b/wolfcrypt/src/chacha.c index c05ff1c65c..b87418a724 100644 --- a/wolfcrypt/src/chacha.c +++ b/wolfcrypt/src/chacha.c @@ -38,6 +38,9 @@ Public domain. #if defined(WOLFSSL_ARMASM) && !defined(WOLFSSL_ARMASM_NO_NEON) /* implementation is located in wolfcrypt/src/port/arm/armv8-chacha.c */ +#elif defined(WOLFSSL_RISCV_ASM) + /* implementation is located in wolfcrypt/src/port/riscv/riscv-64-chacha.c */ + #else #if defined(HAVE_CHACHA) diff --git a/wolfcrypt/src/port/riscv/riscv-64-aes.c b/wolfcrypt/src/port/riscv/riscv-64-aes.c index c438d252ad..292c854d18 100644 --- a/wolfcrypt/src/port/riscv/riscv-64-aes.c +++ b/wolfcrypt/src/port/riscv/riscv-64-aes.c @@ -75,18 +75,6 @@ static WC_INLINE void memcpy16(byte* out, const byte* in) #endif -/* vd = vs2 << uimm */ -#define VSLL_VI(vd, vs2, uimm) \ - ASM_WORD((0b100101 << 26) | (0b1 << 25) | \ - (0b011 << 12) | (0b1010111 << 0) | \ - (vd << 7) | (uimm << 15) | (vs2 << 20)) -/* vd = vs2 >> uimm */ -#define VSRL_VI(vd, vs2, uimm) \ - ASM_WORD((0b101000 << 26) | (0b1 << 25) | \ - (0b011 << 12) | (0b1010111 << 0) | \ - (vd << 7) | (uimm << 15) | (vs2 << 20)) - - /* Vector register set if equal: vd[i] = vs1[i] == vs2[i] ? 1 : 0 */ #define VMSEQ_VV(vd, vs1, vs2) \ ASM_WORD((0b011000 << 26) | (0b1 << 25) | \ diff --git a/wolfcrypt/src/port/riscv/riscv-64-chacha.c b/wolfcrypt/src/port/riscv/riscv-64-chacha.c new file mode 100644 index 0000000000..4087c41064 --- /dev/null +++ b/wolfcrypt/src/port/riscv/riscv-64-chacha.c @@ -0,0 +1,2345 @@ +/* riscv-64-chacha.c + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL.
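
For orientation while reading the new port file: the PART_ROUND_*/QUARTER_ROUND_* macros defined below implement the standard ChaCha20 quarter round, unrolled across several blocks. A minimal reference sketch of that quarter round (illustrative only, not part of the patch; it assumes wolfSSL's word32 type):

    #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

    /* One ChaCha20 quarter round over four words of the 16-word state. */
    static void chacha_quarter_round(word32 x[16], int a, int b, int c, int d)
    {
        x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d], 16);
        x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b], 12);
        x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d],  8);
        x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b],  7);
    }

The macros interleave scalar (GPR) instructions with the vector ones so additional blocks can be processed in the integer pipeline while the vector unit is busy, following the interleaving idea from the Bernstein and Schwabe NEON paper cited in the file header.
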
+ * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* The paper NEON crypto by Daniel J. Bernstein and Peter Schwabe was used to + * optimize for ARM: + * https://cryptojedi.org/papers/veccrypto-20120320.pdf + */ + +#ifdef HAVE_CONFIG_H + #include +#endif + +#include +#include + +#ifdef WOLFSSL_RISCV_ASM +#ifdef HAVE_CHACHA + +#include +#include +#include +#include +#ifdef NO_INLINE + #include +#else + #define WOLFSSL_MISC_INCLUDED + #include +#endif + +#ifdef CHACHA_AEAD_TEST + #include +#endif + +#ifdef CHACHA_TEST + #include +#endif + +/* Number of rounds */ +#define ROUNDS 20 + +#define U32C(v) (v##U) +#define U32V(v) ((word32)(v) & U32C(0xFFFFFFFF)) +#define U8TO32_LITTLE(p) (((word32*)(p))[0]) + +#define PLUS(v,w) (U32V((v) + (w))) +#define PLUSONE(v) (PLUS((v),1)) + +#define ARM_SIMD_LEN_BYTES 16 + +/** + * Set up iv(nonce). Earlier versions used 64 bits instead of 96, this version + * uses the typical AEAD 96 bit nonce and can do record sizes of 256 GB. + */ +int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter) +{ + word32 temp[CHACHA_IV_WORDS];/* used for alignment of memory */ + + if (ctx == NULL) + return BAD_FUNC_ARG; + + XMEMCPY(temp, inIv, CHACHA_IV_BYTES); + + ctx->left = 0; + ctx->X[CHACHA_IV_BYTES+0] = counter; /* block counter */ + ctx->X[CHACHA_IV_BYTES+1] = temp[0]; /* fixed variable from nonce */ + ctx->X[CHACHA_IV_BYTES+2] = temp[1]; /* counter from nonce */ + ctx->X[CHACHA_IV_BYTES+3] = temp[2]; /* counter from nonce */ + + return 0; +} + +/* "expand 32-byte k" as unsigned 32 byte */ +static const word32 sigma[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; +/* "expand 16-byte k" as unsigned 16 byte */ +static const word32 tau[4] = {0x61707865, 0x3120646e, 0x79622d36, 0x6b206574}; + +/** + * Key setup. 
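
The quarter-round macros that follow come in three flavors, selected by the bit-manipulation defines: with base RVV only, the 32-bit left rotate is emulated with shift-left, shift-right and OR (VSLL_VI, VSRL_VI, VOR_VV on the vector side; slli, srliw, or on the scalar side); WOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION replaces the vector sequence with a single VROR_VI; WOLFSSL_RISCV_BASE_BIT_MANIPULATION additionally lets the scalar half use RORIW. A minimal sketch of what all three variants compute per 32-bit lane (illustrative only, not part of the patch):

    /* x rotated left by n bits; a rotate right by (32 - n) gives the same
     * result, which is why VROR_VI and RORIW take the sr argument. */
    static WC_INLINE word32 rotl32(word32 x, unsigned int n)
    {
        return (word32)((x << n) | (x >> (32u - n)));
    }
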
8 word iv (nonce) + */ +int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz) +{ + const word32* constants; + const byte* k; + +#ifdef XSTREAM_ALIGN + word32 alignKey[8]; +#endif + + if (ctx == NULL) + return BAD_FUNC_ARG; + + if (keySz != (CHACHA_MAX_KEY_SZ/2) && keySz != CHACHA_MAX_KEY_SZ) + return BAD_FUNC_ARG; + +#ifdef XSTREAM_ALIGN + if ((wc_ptr_t)key % 4) { + WOLFSSL_MSG("wc_ChachaSetKey unaligned key"); + XMEMCPY(alignKey, key, keySz); + k = (byte*)alignKey; + } + else { + k = key; + } +#else + k = key; +#endif /* XSTREAM_ALIGN */ + + ctx->X[4] = U8TO32_LITTLE(k + 0); + ctx->X[5] = U8TO32_LITTLE(k + 4); + ctx->X[6] = U8TO32_LITTLE(k + 8); + ctx->X[7] = U8TO32_LITTLE(k + 12); + if (keySz == CHACHA_MAX_KEY_SZ) { + k += 16; + constants = sigma; + } + else { + constants = tau; + } + ctx->X[ 8] = U8TO32_LITTLE(k + 0); + ctx->X[ 9] = U8TO32_LITTLE(k + 4); + ctx->X[10] = U8TO32_LITTLE(k + 8); + ctx->X[11] = U8TO32_LITTLE(k + 12); + ctx->X[ 0] = constants[0]; + ctx->X[ 1] = constants[1]; + ctx->X[ 2] = constants[2]; + ctx->X[ 3] = constants[3]; + ctx->left = 0; + + return 0; +} + + +#define CC_A0 "a4" +#define CC_A1 "a5" +#define CC_A2 "a6" +#define CC_A3 "a7" +#define CC_B0 "t3" +#define CC_B1 "t4" +#define CC_B2 "t5" +#define CC_B3 "t6" +#define CC_C0 "s2" +#define CC_C1 "s3" +#define CC_C2 "s4" +#define CC_C3 "s5" +#define CC_D0 "s6" +#define CC_D1 "s7" +#define CC_D2 "s8" +#define CC_D3 "s9" +#define CC_T0 "t0" +#define CC_T1 "t1" +#define CC_T2 "t2" +#define CC_T3 "s1" + +#if defined(WOLFSSL_RISCV_VECTOR) + +static const word32 L_chacha20_vec_inc_first_word[] = { + 0x1, + 0x0, + 0x0, + 0x0, +}; + +#ifndef WOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION + +#define PART_ROUND_ODD_ABD_5(s, sr) \ + VADD_VV(REG_V0, REG_V0, REG_V1) \ + "add " CC_A0 ", " CC_A0 ", " CC_B0 "\n\t" \ + VADD_VV(REG_V4, REG_V4, REG_V5) \ + "add " CC_A1 ", " CC_A1 ", " CC_B1 "\n\t" \ + VADD_VV(REG_V8, REG_V8, REG_V9) \ + "add " CC_A2 ", " CC_A2 ", " CC_B2 "\n\t" \ + VADD_VV(REG_V12, REG_V12, REG_V13) \ + "add " CC_A3 ", " CC_A3 ", " CC_B3 "\n\t" \ + VADD_VV(REG_V16, REG_V16, REG_V17) \ + VXOR_VV(REG_V3, REG_V3, REG_V0) \ + "xor " CC_D0 ", " CC_D0 ", " CC_A0 "\n\t" \ + VXOR_VV(REG_V7, REG_V7, REG_V4) \ + "xor " CC_D1 ", " CC_D1 ", " CC_A1 "\n\t" \ + VXOR_VV(REG_V11, REG_V11, REG_V8) \ + "xor " CC_D2 ", " CC_D2 ", " CC_A2 "\n\t" \ + VXOR_VV(REG_V15, REG_V15, REG_V12) \ + "xor " CC_D3 ", " CC_D3 ", " CC_A3 "\n\t" \ + VXOR_VV(REG_V19, REG_V19, REG_V16) \ + VSLL_VI(REG_V20, REG_V3, s) \ + "slli " CC_T0 ", " CC_D0 ", " #s "\n\t" \ + VSLL_VI(REG_V21, REG_V7, s) \ + "slli " CC_T1 ", " CC_D1 ", " #s "\n\t" \ + VSLL_VI(REG_V22, REG_V11, s) \ + "slli " CC_T2 ", " CC_D2 ", " #s "\n\t" \ + VSLL_VI(REG_V23, REG_V15, s) \ + "slli " CC_T3 ", " CC_D3 ", " #s "\n\t" \ + VSLL_VI(REG_V24, REG_V19, s) \ + VSRL_VI(REG_V3, REG_V3, sr) \ + "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ + VSRL_VI(REG_V7, REG_V7, sr) \ + "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ + VSRL_VI(REG_V11, REG_V11, sr) \ + "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ + VSRL_VI(REG_V15, REG_V15, sr) \ + "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ + VSRL_VI(REG_V19, REG_V19, sr) \ + VOR_VV(REG_V3, REG_V3, REG_V20) \ + "or " CC_D0 ", " CC_D0 ", " CC_T0 "\n\t" \ + VOR_VV(REG_V7, REG_V7, REG_V21) \ + "or " CC_D1 ", " CC_D1 ", " CC_T1 "\n\t" \ + VOR_VV(REG_V11, REG_V11, REG_V22) \ + "or " CC_D2 ", " CC_D2 ", " CC_T2 "\n\t" \ + VOR_VV(REG_V15, REG_V15, REG_V23) \ + "or " CC_D3 ", " CC_D3 ", " CC_T3 "\n\t" \ + VOR_VV(REG_V19, REG_V19, REG_V24) + +#define 
PART_ROUND_ODD_CDB_5(s, sr) \ + VADD_VV(REG_V2, REG_V2, REG_V3) \ + "add " CC_C0 ", " CC_C0 ", " CC_D0 "\n\t" \ + VADD_VV(REG_V6, REG_V6, REG_V7) \ + "add " CC_C1 ", " CC_C1 ", " CC_D1 "\n\t" \ + VADD_VV(REG_V10, REG_V10, REG_V11) \ + "add " CC_C2 ", " CC_C2 ", " CC_D2 "\n\t" \ + VADD_VV(REG_V14, REG_V14, REG_V15) \ + "add " CC_C3 ", " CC_C3 ", " CC_D3 "\n\t" \ + VADD_VV(REG_V18, REG_V18, REG_V19) \ + VXOR_VV(REG_V1, REG_V1, REG_V2) \ + "xor " CC_B0 ", " CC_B0 ", " CC_C0 "\n\t" \ + VXOR_VV(REG_V5, REG_V5, REG_V6) \ + "xor " CC_B1 ", " CC_B1 ", " CC_C1 "\n\t" \ + VXOR_VV(REG_V9, REG_V9, REG_V10) \ + "xor " CC_B2 ", " CC_B2 ", " CC_C2 "\n\t" \ + VXOR_VV(REG_V13, REG_V13, REG_V14) \ + "xor " CC_B3 ", " CC_B3 ", " CC_C3 "\n\t" \ + VXOR_VV(REG_V17, REG_V17, REG_V18) \ + VSLL_VI(REG_V20, REG_V1, s) \ + "slli " CC_T0 ", " CC_B0 ", " #s "\n\t" \ + VSLL_VI(REG_V21, REG_V5, s) \ + "slli " CC_T1 ", " CC_B1 ", " #s "\n\t" \ + VSLL_VI(REG_V22, REG_V9, s) \ + "slli " CC_T2 ", " CC_B2 ", " #s "\n\t" \ + VSLL_VI(REG_V23, REG_V13, s) \ + "slli " CC_T3 ", " CC_B3 ", " #s "\n\t" \ + VSLL_VI(REG_V24, REG_V17, s) \ + VSRL_VI(REG_V1, REG_V1, sr) \ + "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ + VSRL_VI(REG_V5, REG_V5, sr) \ + "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ + VSRL_VI(REG_V9, REG_V9, sr) \ + "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ + VSRL_VI(REG_V13, REG_V13, sr) \ + "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ + VSRL_VI(REG_V17, REG_V17, sr) \ + VOR_VV(REG_V1, REG_V1, REG_V20) \ + "or " CC_B0 ", " CC_B0 ", " CC_T0 "\n\t" \ + VOR_VV(REG_V5, REG_V5, REG_V21) \ + "or " CC_B1 ", " CC_B1 ", " CC_T1 "\n\t" \ + VOR_VV(REG_V9, REG_V9, REG_V22) \ + "or " CC_B2 ", " CC_B2 ", " CC_T2 "\n\t" \ + VOR_VV(REG_V13, REG_V13, REG_V23) \ + "or " CC_B3 ", " CC_B3 ", " CC_T3 "\n\t" \ + VOR_VV(REG_V17, REG_V17, REG_V24) + +#define PART_ROUND_EVEN_ABD_5(s, sr) \ + VADD_VV(REG_V0, REG_V0, REG_V1) \ + "add " CC_A0 ", " CC_A0 ", " CC_B1 "\n\t" \ + VADD_VV(REG_V4, REG_V4, REG_V5) \ + "add " CC_A1 ", " CC_A1 ", " CC_B2 "\n\t" \ + VADD_VV(REG_V8, REG_V8, REG_V9) \ + "add " CC_A2 ", " CC_A2 ", " CC_B3 "\n\t" \ + VADD_VV(REG_V12, REG_V12, REG_V13) \ + "add " CC_A3 ", " CC_A3 ", " CC_B0 "\n\t" \ + VADD_VV(REG_V16, REG_V16, REG_V17) \ + VXOR_VV(REG_V3, REG_V3, REG_V0) \ + "xor " CC_D3 ", " CC_D3 ", " CC_A0 "\n\t" \ + VXOR_VV(REG_V7, REG_V7, REG_V4) \ + "xor " CC_D0 ", " CC_D0 ", " CC_A1 "\n\t" \ + VXOR_VV(REG_V11, REG_V11, REG_V8) \ + "xor " CC_D1 ", " CC_D1 ", " CC_A2 "\n\t" \ + VXOR_VV(REG_V15, REG_V15, REG_V12) \ + "xor " CC_D2 ", " CC_D2 ", " CC_A3 "\n\t" \ + VXOR_VV(REG_V19, REG_V19, REG_V16) \ + VSLL_VI(REG_V20, REG_V3, s) \ + "slli " CC_T0 ", " CC_D3 ", " #s "\n\t" \ + VSLL_VI(REG_V21, REG_V7, s) \ + "slli " CC_T1 ", " CC_D0 ", " #s "\n\t" \ + VSLL_VI(REG_V22, REG_V11, s) \ + "slli " CC_T2 ", " CC_D1 ", " #s "\n\t" \ + VSLL_VI(REG_V23, REG_V15, s) \ + "slli " CC_T3 ", " CC_D2 ", " #s "\n\t" \ + VSLL_VI(REG_V24, REG_V19, s) \ + VSRL_VI(REG_V3, REG_V3, sr) \ + "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ + VSRL_VI(REG_V7, REG_V7, sr) \ + "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ + VSRL_VI(REG_V11, REG_V11, sr) \ + "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ + VSRL_VI(REG_V15, REG_V15, sr) \ + "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ + VSRL_VI(REG_V19, REG_V19, sr) \ + VOR_VV(REG_V3, REG_V3, REG_V20) \ + "or " CC_D3 ", " CC_D3 ", " CC_T0 "\n\t" \ + VOR_VV(REG_V7, REG_V7, REG_V21) \ + "or " CC_D0 ", " CC_D0 ", " CC_T1 "\n\t" \ + VOR_VV(REG_V11, REG_V11, REG_V22) \ + "or " CC_D1 ", " CC_D1 ", " CC_T2 "\n\t" \ + 
VOR_VV(REG_V15, REG_V15, REG_V23) \ + "or " CC_D2 ", " CC_D2 ", " CC_T3 "\n\t" \ + VOR_VV(REG_V19, REG_V19, REG_V24) + +#define PART_ROUND_EVEN_CDB_5(s, sr) \ + VADD_VV(REG_V2, REG_V2, REG_V3) \ + "add " CC_C2 ", " CC_C2 ", " CC_D3 "\n\t" \ + VADD_VV(REG_V6, REG_V6, REG_V7) \ + "add " CC_C3 ", " CC_C3 ", " CC_D0 "\n\t" \ + VADD_VV(REG_V10, REG_V10, REG_V11) \ + "add " CC_C0 ", " CC_C0 ", " CC_D1 "\n\t" \ + VADD_VV(REG_V14, REG_V14, REG_V15) \ + "add " CC_C1 ", " CC_C1 ", " CC_D2 "\n\t" \ + VADD_VV(REG_V18, REG_V18, REG_V19) \ + VXOR_VV(REG_V1, REG_V1, REG_V2) \ + "xor " CC_B1 ", " CC_B1 ", " CC_C2 "\n\t" \ + VXOR_VV(REG_V5, REG_V5, REG_V6) \ + "xor " CC_B2 ", " CC_B2 ", " CC_C3 "\n\t" \ + VXOR_VV(REG_V9, REG_V9, REG_V10) \ + "xor " CC_B3 ", " CC_B3 ", " CC_C0 "\n\t" \ + VXOR_VV(REG_V13, REG_V13, REG_V14) \ + "xor " CC_B0 ", " CC_B0 ", " CC_C1 "\n\t" \ + VXOR_VV(REG_V17, REG_V17, REG_V18) \ + VSLL_VI(REG_V20, REG_V1, s) \ + "slli " CC_T0 ", " CC_B1 ", " #s "\n\t" \ + VSLL_VI(REG_V21, REG_V5, s) \ + "slli " CC_T1 ", " CC_B2 ", " #s "\n\t" \ + VSLL_VI(REG_V22, REG_V9, s) \ + "slli " CC_T2 ", " CC_B3 ", " #s "\n\t" \ + VSLL_VI(REG_V23, REG_V13, s) \ + "slli " CC_T3 ", " CC_B0 ", " #s "\n\t" \ + VSLL_VI(REG_V24, REG_V17, s) \ + VSRL_VI(REG_V1, REG_V1, sr) \ + "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ + VSRL_VI(REG_V5, REG_V5, sr) \ + "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ + VSRL_VI(REG_V9, REG_V9, sr) \ + "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ + VSRL_VI(REG_V13, REG_V13, sr) \ + "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ + VSRL_VI(REG_V17, REG_V17, sr) \ + VOR_VV(REG_V1, REG_V1, REG_V20) \ + "or " CC_B1 ", " CC_B1 ", " CC_T0 "\n\t" \ + VOR_VV(REG_V5, REG_V5, REG_V21) \ + "or " CC_B2 ", " CC_B2 ", " CC_T1 "\n\t" \ + VOR_VV(REG_V9, REG_V9, REG_V22) \ + "or " CC_B3 ", " CC_B3 ", " CC_T2 "\n\t" \ + VOR_VV(REG_V13, REG_V13, REG_V23) \ + "or " CC_B0 ", " CC_B0 ", " CC_T3 "\n\t" \ + VOR_VV(REG_V17, REG_V17, REG_V24) + +#elif !defined(WOLFSSL_RISCV_BASE_BIT_MANIPULATION ) + +#define PART_ROUND_ODD_ABD_5(s, sr) \ + VADD_VV(REG_V0, REG_V0, REG_V1) \ + "add " CC_A0 ", " CC_A0 ", " CC_B0 "\n\t" \ + VADD_VV(REG_V4, REG_V4, REG_V5) \ + "add " CC_A1 ", " CC_A1 ", " CC_B1 "\n\t" \ + VADD_VV(REG_V8, REG_V8, REG_V9) \ + "add " CC_A2 ", " CC_A2 ", " CC_B2 "\n\t" \ + VADD_VV(REG_V12, REG_V12, REG_V13) \ + "add " CC_A3 ", " CC_A3 ", " CC_B3 "\n\t" \ + VADD_VV(REG_V16, REG_V16, REG_V17) \ + "xor " CC_D0 ", " CC_D0 ", " CC_A0 "\n\t" \ + VXOR_VV(REG_V3, REG_V3, REG_V0) \ + "xor " CC_D1 ", " CC_D1 ", " CC_A1 "\n\t" \ + VXOR_VV(REG_V7, REG_V7, REG_V4) \ + "xor " CC_D2 ", " CC_D2 ", " CC_A2 "\n\t" \ + VXOR_VV(REG_V11, REG_V11, REG_V8) \ + "xor " CC_D3 ", " CC_D3 ", " CC_A3 "\n\t" \ + VXOR_VV(REG_V15, REG_V15, REG_V12) \ + "slli " CC_T0 ", " CC_D0 ", " #s "\n\t" \ + VXOR_VV(REG_V19, REG_V19, REG_V16) \ + "slli " CC_T1 ", " CC_D1 ", " #s "\n\t" \ + VROR_VI(REG_V3, sr, REG_V3) \ + "slli " CC_T2 ", " CC_D2 ", " #s "\n\t" \ + VROR_VI(REG_V7, sr, REG_V7) \ + "slli " CC_T3 ", " CC_D3 ", " #s "\n\t" \ + VROR_VI(REG_V11, sr, REG_V11) \ + "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ + VROR_VI(REG_V15, sr, REG_V15) \ + "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ + VROR_VI(REG_V19, sr, REG_V19) \ + "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ + "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ + "or " CC_D0 ", " CC_D0 ", " CC_T0 "\n\t" \ + "or " CC_D1 ", " CC_D1 ", " CC_T1 "\n\t" \ + "or " CC_D2 ", " CC_D2 ", " CC_T2 "\n\t" \ + "or " CC_D3 ", " CC_D3 ", " CC_T3 "\n\t" + +#define PART_ROUND_ODD_CDB_5(s, sr) \ + VADD_VV(REG_V2, 
REG_V2, REG_V3) \ + "add " CC_C0 ", " CC_C0 ", " CC_D0 "\n\t" \ + VADD_VV(REG_V6, REG_V6, REG_V7) \ + "add " CC_C1 ", " CC_C1 ", " CC_D1 "\n\t" \ + VADD_VV(REG_V10, REG_V10, REG_V11) \ + "add " CC_C2 ", " CC_C2 ", " CC_D2 "\n\t" \ + VADD_VV(REG_V14, REG_V14, REG_V15) \ + "add " CC_C3 ", " CC_C3 ", " CC_D3 "\n\t" \ + VADD_VV(REG_V18, REG_V18, REG_V19) \ + "xor " CC_B0 ", " CC_B0 ", " CC_C0 "\n\t" \ + VXOR_VV(REG_V1, REG_V1, REG_V2) \ + "xor " CC_B1 ", " CC_B1 ", " CC_C1 "\n\t" \ + VXOR_VV(REG_V5, REG_V5, REG_V6) \ + "xor " CC_B2 ", " CC_B2 ", " CC_C2 "\n\t" \ + VXOR_VV(REG_V9, REG_V9, REG_V10) \ + "xor " CC_B3 ", " CC_B3 ", " CC_C3 "\n\t" \ + VXOR_VV(REG_V13, REG_V13, REG_V14) \ + "slli " CC_T0 ", " CC_B0 ", " #s "\n\t" \ + VXOR_VV(REG_V17, REG_V17, REG_V18) \ + "slli " CC_T1 ", " CC_B1 ", " #s "\n\t" \ + VROR_VI(REG_V1, sr, REG_V1) \ + "slli " CC_T2 ", " CC_B2 ", " #s "\n\t" \ + VROR_VI(REG_V5, sr, REG_V5) \ + "slli " CC_T3 ", " CC_B3 ", " #s "\n\t" \ + VROR_VI(REG_V9, sr, REG_V9) \ + "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ + VROR_VI(REG_V13, sr, REG_V13) \ + "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ + VROR_VI(REG_V17, sr, REG_V17) \ + "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ + "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ + "or " CC_B0 ", " CC_B0 ", " CC_T0 "\n\t" \ + "or " CC_B1 ", " CC_B1 ", " CC_T1 "\n\t" \ + "or " CC_B2 ", " CC_B2 ", " CC_T2 "\n\t" \ + "or " CC_B3 ", " CC_B3 ", " CC_T3 "\n\t" + +#define PART_ROUND_EVEN_ABD_5(s, sr) \ + VADD_VV(REG_V0, REG_V0, REG_V1) \ + "add " CC_A0 ", " CC_A0 ", " CC_B1 "\n\t" \ + VADD_VV(REG_V4, REG_V4, REG_V5) \ + "add " CC_A1 ", " CC_A1 ", " CC_B2 "\n\t" \ + VADD_VV(REG_V8, REG_V8, REG_V9) \ + "add " CC_A2 ", " CC_A2 ", " CC_B3 "\n\t" \ + VADD_VV(REG_V12, REG_V12, REG_V13) \ + "add " CC_A3 ", " CC_A3 ", " CC_B0 "\n\t" \ + VADD_VV(REG_V16, REG_V16, REG_V17) \ + "xor " CC_D3 ", " CC_D3 ", " CC_A0 "\n\t" \ + VXOR_VV(REG_V3, REG_V3, REG_V0) \ + "xor " CC_D0 ", " CC_D0 ", " CC_A1 "\n\t" \ + VXOR_VV(REG_V7, REG_V7, REG_V4) \ + "xor " CC_D1 ", " CC_D1 ", " CC_A2 "\n\t" \ + VXOR_VV(REG_V11, REG_V11, REG_V8) \ + "xor " CC_D2 ", " CC_D2 ", " CC_A3 "\n\t" \ + VXOR_VV(REG_V15, REG_V15, REG_V12) \ + "slli " CC_T0 ", " CC_D3 ", " #s "\n\t" \ + VXOR_VV(REG_V19, REG_V19, REG_V16) \ + "slli " CC_T1 ", " CC_D0 ", " #s "\n\t" \ + VROR_VI(REG_V3, sr, REG_V3) \ + "slli " CC_T2 ", " CC_D1 ", " #s "\n\t" \ + VROR_VI(REG_V7, sr, REG_V7) \ + "slli " CC_T3 ", " CC_D2 ", " #s "\n\t" \ + VROR_VI(REG_V11, sr, REG_V11) \ + "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ + VROR_VI(REG_V15, sr, REG_V15) \ + "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ + VROR_VI(REG_V19, sr, REG_V19) \ + "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ + "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ + "or " CC_D3 ", " CC_D3 ", " CC_T0 "\n\t" \ + "or " CC_D0 ", " CC_D0 ", " CC_T1 "\n\t" \ + "or " CC_D1 ", " CC_D1 ", " CC_T2 "\n\t" \ + "or " CC_D2 ", " CC_D2 ", " CC_T3 "\n\t" + +#define PART_ROUND_EVEN_CDB_5(s, sr) \ + VADD_VV(REG_V2, REG_V2, REG_V3) \ + "add " CC_C2 ", " CC_C2 ", " CC_D3 "\n\t" \ + VADD_VV(REG_V6, REG_V6, REG_V7) \ + "add " CC_C3 ", " CC_C3 ", " CC_D0 "\n\t" \ + VADD_VV(REG_V10, REG_V10, REG_V11) \ + "add " CC_C0 ", " CC_C0 ", " CC_D1 "\n\t" \ + VADD_VV(REG_V14, REG_V14, REG_V15) \ + "add " CC_C1 ", " CC_C1 ", " CC_D2 "\n\t" \ + VADD_VV(REG_V18, REG_V18, REG_V19) \ + "xor " CC_B1 ", " CC_B1 ", " CC_C2 "\n\t" \ + VXOR_VV(REG_V1, REG_V1, REG_V2) \ + "xor " CC_B2 ", " CC_B2 ", " CC_C3 "\n\t" \ + VXOR_VV(REG_V5, REG_V5, REG_V6) \ + "xor " CC_B3 ", " CC_B3 ", " CC_C0 "\n\t" \ + 
VXOR_VV(REG_V9, REG_V9, REG_V10) \ + "xor " CC_B0 ", " CC_B0 ", " CC_C1 "\n\t" \ + VXOR_VV(REG_V13, REG_V13, REG_V14) \ + "slli " CC_T0 ", " CC_B1 ", " #s "\n\t" \ + VXOR_VV(REG_V17, REG_V17, REG_V18) \ + "slli " CC_T1 ", " CC_B2 ", " #s "\n\t" \ + VROR_VI(REG_V1, sr, REG_V1) \ + "slli " CC_T2 ", " CC_B3 ", " #s "\n\t" \ + VROR_VI(REG_V5, sr, REG_V5) \ + "slli " CC_T3 ", " CC_B0 ", " #s "\n\t" \ + VROR_VI(REG_V9, sr, REG_V9) \ + "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ + VROR_VI(REG_V13, sr, REG_V13) \ + "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ + VROR_VI(REG_V17, sr, REG_V17) \ + "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ + "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ + "or " CC_B1 ", " CC_B1 ", " CC_T0 "\n\t" \ + "or " CC_B2 ", " CC_B2 ", " CC_T1 "\n\t" \ + "or " CC_B3 ", " CC_B3 ", " CC_T2 "\n\t" \ + "or " CC_B0 ", " CC_B0 ", " CC_T3 "\n\t" + +#else + +#define PART_ROUND_ODD_ABD_5(s, sr) \ + VADD_VV(REG_V0, REG_V0, REG_V1) \ + "add " CC_A0 ", " CC_A0 ", " CC_B0 "\n\t" \ + VADD_VV(REG_V4, REG_V4, REG_V5) \ + "add " CC_A1 ", " CC_A1 ", " CC_B1 "\n\t" \ + VADD_VV(REG_V8, REG_V8, REG_V9) \ + "add " CC_A2 ", " CC_A2 ", " CC_B2 "\n\t" \ + VADD_VV(REG_V12, REG_V12, REG_V13) \ + "add " CC_A3 ", " CC_A3 ", " CC_B3 "\n\t" \ + VADD_VV(REG_V16, REG_V16, REG_V17) \ + VXOR_VV(REG_V3, REG_V3, REG_V0) \ + "xor " CC_D0 ", " CC_D0 ", " CC_A0 "\n\t" \ + VXOR_VV(REG_V7, REG_V7, REG_V4) \ + "xor " CC_D1 ", " CC_D1 ", " CC_A1 "\n\t" \ + VXOR_VV(REG_V11, REG_V11, REG_V8) \ + "xor " CC_D2 ", " CC_D2 ", " CC_A2 "\n\t" \ + VXOR_VV(REG_V15, REG_V15, REG_V12) \ + "xor " CC_D3 ", " CC_D3 ", " CC_A3 "\n\t" \ + VXOR_VV(REG_V19, REG_V19, REG_V16) \ + VROR_VI(REG_V3, sr, REG_V3) \ + RORIW(REG_S6, REG_S6, sr) \ + VROR_VI(REG_V7, sr, REG_V7) \ + RORIW(REG_S7, REG_S7, sr) \ + VROR_VI(REG_V11, sr, REG_V11) \ + RORIW(REG_S8, REG_S8, sr) \ + VROR_VI(REG_V15, sr, REG_V15) \ + RORIW(REG_S9, REG_S9, sr) \ + VROR_VI(REG_V19, sr, REG_V19) + +#define PART_ROUND_ODD_CDB_5(s, sr) \ + VADD_VV(REG_V2, REG_V2, REG_V3) \ + "add " CC_C0 ", " CC_C0 ", " CC_D0 "\n\t" \ + VADD_VV(REG_V6, REG_V6, REG_V7) \ + "add " CC_C1 ", " CC_C1 ", " CC_D1 "\n\t" \ + VADD_VV(REG_V10, REG_V10, REG_V11) \ + "add " CC_C2 ", " CC_C2 ", " CC_D2 "\n\t" \ + VADD_VV(REG_V14, REG_V14, REG_V15) \ + "add " CC_C3 ", " CC_C3 ", " CC_D3 "\n\t" \ + VADD_VV(REG_V18, REG_V18, REG_V19) \ + VXOR_VV(REG_V1, REG_V1, REG_V2) \ + "xor " CC_B0 ", " CC_B0 ", " CC_C0 "\n\t" \ + VXOR_VV(REG_V5, REG_V5, REG_V6) \ + "xor " CC_B1 ", " CC_B1 ", " CC_C1 "\n\t" \ + VXOR_VV(REG_V9, REG_V9, REG_V10) \ + "xor " CC_B2 ", " CC_B2 ", " CC_C2 "\n\t" \ + VXOR_VV(REG_V13, REG_V13, REG_V14) \ + "xor " CC_B3 ", " CC_B3 ", " CC_C3 "\n\t" \ + VXOR_VV(REG_V17, REG_V17, REG_V18) \ + VROR_VI(REG_V1, sr, REG_V1) \ + RORIW(REG_T3, REG_T3, sr) \ + VROR_VI(REG_V5, sr, REG_V5) \ + RORIW(REG_T4, REG_T4, sr) \ + VROR_VI(REG_V9, sr, REG_V9) \ + RORIW(REG_T5, REG_T5, sr) \ + VROR_VI(REG_V13, sr, REG_V13) \ + RORIW(REG_T6, REG_T6, sr) \ + VROR_VI(REG_V17, sr, REG_V17) + +#define PART_ROUND_EVEN_ABD_5(s, sr) \ + VADD_VV(REG_V0, REG_V0, REG_V1) \ + "add " CC_A0 ", " CC_A0 ", " CC_B1 "\n\t" \ + VADD_VV(REG_V4, REG_V4, REG_V5) \ + "add " CC_A1 ", " CC_A1 ", " CC_B2 "\n\t" \ + VADD_VV(REG_V8, REG_V8, REG_V9) \ + "add " CC_A2 ", " CC_A2 ", " CC_B3 "\n\t" \ + VADD_VV(REG_V12, REG_V12, REG_V13) \ + "add " CC_A3 ", " CC_A3 ", " CC_B0 "\n\t" \ + VADD_VV(REG_V16, REG_V16, REG_V17) \ + VXOR_VV(REG_V3, REG_V3, REG_V0) \ + "xor " CC_D3 ", " CC_D3 ", " CC_A0 "\n\t" \ + VXOR_VV(REG_V7, REG_V7, REG_V4) \ + "xor " CC_D0 ", " 
CC_D0 ", " CC_A1 "\n\t" \ + VXOR_VV(REG_V11, REG_V11, REG_V8) \ + "xor " CC_D1 ", " CC_D1 ", " CC_A2 "\n\t" \ + VXOR_VV(REG_V15, REG_V15, REG_V12) \ + "xor " CC_D2 ", " CC_D2 ", " CC_A3 "\n\t" \ + VXOR_VV(REG_V19, REG_V19, REG_V16) \ + VROR_VI(REG_V3, sr, REG_V3) \ + RORIW(REG_S9, REG_S9, sr) \ + VROR_VI(REG_V7, sr, REG_V7) \ + RORIW(REG_S6, REG_S6, sr) \ + VROR_VI(REG_V11, sr, REG_V11) \ + RORIW(REG_S7, REG_S7, sr) \ + VROR_VI(REG_V15, sr, REG_V15) \ + RORIW(REG_S8, REG_S8, sr) \ + VROR_VI(REG_V19, sr, REG_V19) + +#define PART_ROUND_EVEN_CDB_5(s, sr) \ + VADD_VV(REG_V2, REG_V2, REG_V3) \ + "add " CC_C2 ", " CC_C2 ", " CC_D3 "\n\t" \ + VADD_VV(REG_V6, REG_V6, REG_V7) \ + "add " CC_C3 ", " CC_C3 ", " CC_D0 "\n\t" \ + VADD_VV(REG_V10, REG_V10, REG_V11) \ + "add " CC_C0 ", " CC_C0 ", " CC_D1 "\n\t" \ + VADD_VV(REG_V14, REG_V14, REG_V15) \ + "add " CC_C1 ", " CC_C1 ", " CC_D2 "\n\t" \ + VADD_VV(REG_V18, REG_V18, REG_V19) \ + VXOR_VV(REG_V1, REG_V1, REG_V2) \ + "xor " CC_B1 ", " CC_B1 ", " CC_C2 "\n\t" \ + VXOR_VV(REG_V5, REG_V5, REG_V6) \ + "xor " CC_B2 ", " CC_B2 ", " CC_C3 "\n\t" \ + VXOR_VV(REG_V9, REG_V9, REG_V10) \ + "xor " CC_B3 ", " CC_B3 ", " CC_C0 "\n\t" \ + VXOR_VV(REG_V13, REG_V13, REG_V14) \ + "xor " CC_B0 ", " CC_B0 ", " CC_C1 "\n\t" \ + VXOR_VV(REG_V17, REG_V17, REG_V18) \ + VROR_VI(REG_V1, sr, REG_V1) \ + RORIW(REG_T4, REG_T4, sr) \ + VROR_VI(REG_V5, sr, REG_V5) \ + RORIW(REG_T5, REG_T5, sr) \ + VROR_VI(REG_V9, sr, REG_V9) \ + RORIW(REG_T6, REG_T6, sr) \ + VROR_VI(REG_V13, sr, REG_V13) \ + RORIW(REG_T3, REG_T3, sr) \ + VROR_VI(REG_V17, sr, REG_V17) + +#endif + +#define QUARTER_ROUND_ODD_5() \ + /* a += b; d ^= a; d <<<= 16; */ \ + PART_ROUND_ODD_ABD_5(16, 16) \ + /* c += d; b ^= c; b <<<= 12; */ \ + PART_ROUND_ODD_CDB_5(12, 20) \ + /* a += b; d ^= a; d <<<= 8; */ \ + PART_ROUND_ODD_ABD_5( 8, 24) \ + /* c += d; b ^= c; b <<<= 7; */ \ + PART_ROUND_ODD_CDB_5( 7, 25) + +#define QUARTER_ROUND_EVEN_5() \ + /* a += b; d ^= a; d <<<= 16; */ \ + PART_ROUND_EVEN_ABD_5(16, 16) \ + /* c += d; b ^= c; b <<<= 12; */ \ + PART_ROUND_EVEN_CDB_5(12, 20) \ + /* a += b; d ^= a; d <<<= 8; */ \ + PART_ROUND_EVEN_ABD_5( 8, 24) \ + /* c += d; b ^= c; b <<<= 7; */ \ + PART_ROUND_EVEN_CDB_5( 7, 25) + +#define SHUFFLE_5(r, t, i) \ + VRGATHER_VV(t + 0, i, r + 0) \ + VRGATHER_VV(t + 1, i, r + 4) \ + VRGATHER_VV(t + 2, i, r + 8) \ + VRGATHER_VV(t + 3, i, r + 12) \ + VRGATHER_VV(t + 4, i, r + 16) \ + VMV_V_V(r + 0, t + 0) \ + VMV_V_V(r + 4, t + 1) \ + VMV_V_V(r + 8, t + 2) \ + VMV_V_V(r + 12, t + 3) \ + VMV_V_V(r + 16, t + 4) + +#define ODD_SHUFFLE_5() \ + /* a=0,1,2,3; b=4,5,6,7; c=8,9,10,11; d=12,13,14,15 \ + * => a=0,1,2,3; b=5,6,7,4; c=10,11,8,9; d=15,12,13,14 */ \ + SHUFFLE_5(REG_V3, REG_V20, REG_V27) \ + SHUFFLE_5(REG_V1, REG_V20, REG_V25) \ + SHUFFLE_5(REG_V2, REG_V20, REG_V26) + +#define EVEN_SHUFFLE_5() \ + /* a=0,1,2,3; b=5,6,7,4; c=10,11,8,9; d=15,12,13,14 \ + * => a=0,1,2,3; b=4,5,6,7; c=8,9,10,11; d=12,13,14,15 */ \ + SHUFFLE_5(REG_V3, REG_V20, REG_V25) \ + SHUFFLE_5(REG_V1, REG_V20, REG_V27) \ + SHUFFLE_5(REG_V2, REG_V20, REG_V26) + +static WC_INLINE void wc_chacha_encrypt_384(const word32* input, const byte* m, + byte* c, word32 bytes) +{ + word64 bytes64 = (word64)bytes; + + __asm__ __volatile__ ( + VSETIVLI(REG_X0, 4, 1, 1, 0b010, 0b000) + /* The layout of used vector registers is: + * v0-v3 - first block + * v4-v7 - second block + * v8-v11 - third block + * v12-v15 - fourth block + * v16-v19 - fifth block + * v20-v24 - temp/message + * v25-v27 - indeces for rotating words in vector + * 
v28-v31 - input + * + * v0 0 1 2 3 + * v1 4 5 6 7 + * v2 8 9 10 11 + * v3 12 13 14 15 + * load CHACHA state with indices placed as shown above + */ + + /* Load state to encrypt */ + "mv t2, %[input]\n\t" + VL4RE32_V(REG_V28, REG_T2) + VID_V(REG_V20) + VSLIDEDOWN_VI(REG_V25, REG_V20, 1) + VSLIDEUP_VI(REG_V25, REG_V20, 3) + VSLIDEDOWN_VI(REG_V26, REG_V20, 2) + VSLIDEUP_VI(REG_V26, REG_V20, 2) + VSLIDEDOWN_VI(REG_V27, REG_V20, 3) + VSLIDEUP_VI(REG_V27, REG_V20, 1) + "\n" + "L_chacha20_riscv_384_outer:\n\t" + /* Move state into regular registers */ + "ld a4, 0(%[input])\n\t" + "ld a6, 8(%[input])\n\t" + "ld t3, 16(%[input])\n\t" + "ld t5, 24(%[input])\n\t" + "ld s2, 32(%[input])\n\t" + "ld s4, 40(%[input])\n\t" + "lw s7, 52(%[input])\n\t" + "ld s8, 56(%[input])\n\t" + "srli a5, a4, 32\n\t" + "srli a7, a6, 32\n\t" + "srli t4, t3, 32\n\t" + "srli t6, t5, 32\n\t" + "srli s3, s2, 32\n\t" + "srli s5, s4, 32\n\t" + "srli s9, s8, 32\n\t" + VMV_X_S(REG_S6, REG_V31) + /* Move state into vector registers */ + VMVR_V(REG_V0, REG_V28, 4) + VMVR_V(REG_V4, REG_V28, 4) + VMVR_V(REG_V8, REG_V28, 4) + VMVR_V(REG_V12, REG_V28, 4) + VMVR_V(REG_V16, REG_V28, 4) + /* Set counter word */ + "addi t1, s6, 1\n\t" + VMV_S_X(REG_V7, REG_T1) + "addi t1, s6, 2\n\t" + VMV_S_X(REG_V11, REG_T1) + "addi t1, s6, 3\n\t" + VMV_S_X(REG_V15, REG_T1) + "addi t1, s6, 4\n\t" + VMV_S_X(REG_V19, REG_T1) + "addi s6, s6, 5\n\t" + /* Set number of odd+even rounds to perform */ + "li a3, 10\n\t" + "\n" + "L_chacha20_riscv_384_loop:\n\t" + /* Odd Round */ + QUARTER_ROUND_ODD_5() + ODD_SHUFFLE_5() + /* Even Round */ + QUARTER_ROUND_EVEN_5() + EVEN_SHUFFLE_5() + "addi a3, a3, -1\n\t" + "bnez a3, L_chacha20_riscv_384_loop\n\t" + /* Load message */ + "mv t2, %[m]\n\t" + VL4RE32_V(REG_V20, REG_T2) + "addi %[m], %[m], 64\n\t" + /* Add back state, XOR in message and store (load next block) */ + /* BLOCK 1 */ + VADD_VV(REG_V0, REG_V0, REG_V28) + VADD_VV(REG_V1, REG_V1, REG_V29) + VADD_VV(REG_V2, REG_V2, REG_V30) + VADD_VV(REG_V3, REG_V3, REG_V31) + VXOR_VV(REG_V0, REG_V0, REG_V20) + VXOR_VV(REG_V1, REG_V1, REG_V21) + VXOR_VV(REG_V2, REG_V2, REG_V22) + VXOR_VV(REG_V3, REG_V3, REG_V23) + "mv t2, %[m]\n\t" + VL4RE32_V(REG_V20, REG_T2) + "addi %[m], %[m], 64\n\t" + VMV_X_S(REG_T0, REG_V31) + "mv t2, %[c]\n\t" + VS4R_V(REG_V0, REG_T2) + "addi %[c], %[c], 64\n\t" + /* BLOCK 2 */ + "addi t0, t0, 1\n\t" + VMV_S_X(REG_V31, REG_T0) + VADD_VV(REG_V4, REG_V4, REG_V28) + VADD_VV(REG_V5, REG_V5, REG_V29) + VADD_VV(REG_V6, REG_V6, REG_V30) + VADD_VV(REG_V7, REG_V7, REG_V31) + VXOR_VV(REG_V4, REG_V4, REG_V20) + VXOR_VV(REG_V5, REG_V5, REG_V21) + VXOR_VV(REG_V6, REG_V6, REG_V22) + VXOR_VV(REG_V7, REG_V7, REG_V23) + "mv t2, %[m]\n\t" + VL4RE32_V(REG_V20, REG_T2) + "addi %[m], %[m], 64\n\t" + "mv t2, %[c]\n\t" + VS4R_V(REG_V4, REG_T2) + "addi %[c], %[c], 64\n\t" + /* BLOCK 3 */ + "addi t0, t0, 1\n\t" + VMV_S_X(REG_V31, REG_T0) + VADD_VV(REG_V8, REG_V8, REG_V28) + VADD_VV(REG_V9, REG_V9, REG_V29) + VADD_VV(REG_V10, REG_V10, REG_V30) + VADD_VV(REG_V11, REG_V11, REG_V31) + VXOR_VV(REG_V8, REG_V8, REG_V20) + VXOR_VV(REG_V9, REG_V9, REG_V21) + VXOR_VV(REG_V10, REG_V10, REG_V22) + VXOR_VV(REG_V11, REG_V11, REG_V23) + "mv t2, %[m]\n\t" + VL4RE32_V(REG_V20, REG_T2) + "addi %[m], %[m], 64\n\t" + "mv t2, %[c]\n\t" + VS4R_V(REG_V8, REG_T2) + "addi %[c], %[c], 64\n\t" + /* BLOCK 4 */ + "addi t0, t0, 1\n\t" + VMV_S_X(REG_V31, REG_T0) + VADD_VV(REG_V12, REG_V12, REG_V28) + VADD_VV(REG_V13, REG_V13, REG_V29) + VADD_VV(REG_V14, REG_V14, REG_V30) + VADD_VV(REG_V15, REG_V15, REG_V31) 
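
/* At this point in BLOCK 4 the saved input state (v28-v31) has been added
 * back into the working state; the VXOR_VV and VS4R_V steps that follow XOR
 * in the 64-byte message block held in v20-v23 and store the ciphertext.
 * Each pass through L_chacha20_riscv_384_outer handles six blocks this way:
 * five kept in vector register groups (v0-v3 ... v16-v19) and a sixth
 * carried in scalar registers during the rounds, so one iteration produces
 * 384 bytes and advances the block counter by six. */
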
+ VXOR_VV(REG_V12, REG_V12, REG_V20) + VXOR_VV(REG_V13, REG_V13, REG_V21) + VXOR_VV(REG_V14, REG_V14, REG_V22) + VXOR_VV(REG_V15, REG_V15, REG_V23) + "mv t2, %[m]\n\t" + VL4RE32_V(REG_V20, REG_T2) + "addi %[m], %[m], 64\n\t" + "mv t2, %[c]\n\t" + VS4R_V(REG_V12, REG_T2) + "addi %[c], %[c], 64\n\t" + /* BLOCK 5 */ + "addi t0, t0, 1\n\t" + VMV_S_X(REG_V31, REG_T0) + VADD_VV(REG_V16, REG_V16, REG_V28) + VADD_VV(REG_V17, REG_V17, REG_V29) + VADD_VV(REG_V18, REG_V18, REG_V30) + VADD_VV(REG_V19, REG_V19, REG_V31) + VXOR_VV(REG_V16, REG_V16, REG_V20) + VXOR_VV(REG_V17, REG_V17, REG_V21) + VXOR_VV(REG_V18, REG_V18, REG_V22) + VXOR_VV(REG_V19, REG_V19, REG_V23) + "mv t2, %[m]\n\t" + VL4RE32_V(REG_V20, REG_T2) + "addi %[m], %[m], 64\n\t" + "mv t2, %[c]\n\t" + VS4R_V(REG_V16, REG_T2) + "addi %[c], %[c], 64\n\t" + /* BLOCK 6 */ + /* Move regular registers into vector registers for adding and xor */ + "addi t0, t0, 1\n\t" + VMV_S_X(REG_V0, REG_A4) + VMV_S_X(REG_V1, REG_T3) + VMV_S_X(REG_V2, REG_S2) + VMV_S_X(REG_V3, REG_S6) + VMV_S_X(REG_V4, REG_A5) + VMV_S_X(REG_V5, REG_T4) + VMV_S_X(REG_V6, REG_S3) + VMV_S_X(REG_V7, REG_S7) + VSLIDEUP_VI(REG_V0, REG_V4, 1) + VSLIDEUP_VI(REG_V1, REG_V5, 1) + VSLIDEUP_VI(REG_V2, REG_V6, 1) + VSLIDEUP_VI(REG_V3, REG_V7, 1) + VMV_S_X(REG_V4, REG_A6) + VMV_S_X(REG_V5, REG_T5) + VMV_S_X(REG_V6, REG_S4) + VMV_S_X(REG_V7, REG_S8) + VSLIDEUP_VI(REG_V0, REG_V4, 2) + VSLIDEUP_VI(REG_V1, REG_V5, 2) + VSLIDEUP_VI(REG_V2, REG_V6, 2) + VSLIDEUP_VI(REG_V3, REG_V7, 2) + VMV_S_X(REG_V4, REG_A7) + VMV_S_X(REG_V5, REG_T6) + VMV_S_X(REG_V6, REG_S5) + VMV_S_X(REG_V7, REG_S9) + VSLIDEUP_VI(REG_V0, REG_V4, 3) + VSLIDEUP_VI(REG_V1, REG_V5, 3) + VSLIDEUP_VI(REG_V2, REG_V6, 3) + VSLIDEUP_VI(REG_V3, REG_V7, 3) + VMV_S_X(REG_V31, REG_T0) + /* Add back state, XOR in message and store */ + VADD_VV(REG_V0, REG_V0, REG_V28) + VADD_VV(REG_V1, REG_V1, REG_V29) + VADD_VV(REG_V2, REG_V2, REG_V30) + VADD_VV(REG_V3, REG_V3, REG_V31) + VXOR_VV(REG_V0, REG_V0, REG_V20) + VXOR_VV(REG_V1, REG_V1, REG_V21) + VXOR_VV(REG_V2, REG_V2, REG_V22) + VXOR_VV(REG_V3, REG_V3, REG_V23) + "mv t2, %[c]\n\t" + VS4R_V(REG_V0, REG_T2) + "addi %[c], %[c], 64\n\t" + "addi %[bytes], %[bytes], -384\n\t" + "addi t0, t0, 1\n\t" + VMV_S_X(REG_V31, REG_T0) + "bnez %[bytes], L_chacha20_riscv_384_outer\n\t" + : [m] "+r" (m), [c] "+r" (c), [bytes] "+r" (bytes64) + : [input] "r" (input) + : "memory", "t0", "t1", "t2", "s1", "a3", + "t3", "t4", "t5", "t6", + "a4", "a5", "a6", "a7", + "s2", "s3", "s4", "s5", + "s6", "s7", "s8", "s9" + ); +} + +#ifndef WOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION + +#define PART_ROUND_ODD_ABD(s, sr) \ + "add " CC_A0 ", " CC_A0 ", " CC_B0 "\n\t" \ + VADD_VV(REG_V0, REG_V0, REG_V1) \ + "add " CC_A1 ", " CC_A1 ", " CC_B1 "\n\t" \ + VADD_VV(REG_V4, REG_V4, REG_V5) \ + "add " CC_A2 ", " CC_A2 ", " CC_B2 "\n\t" \ + VADD_VV(REG_V8, REG_V8, REG_V9) \ + "add " CC_A3 ", " CC_A3 ", " CC_B3 "\n\t" \ + VXOR_VV(REG_V3, REG_V3, REG_V0) \ + "xor " CC_D0 ", " CC_D0 ", " CC_A0 "\n\t" \ + VXOR_VV(REG_V7, REG_V7, REG_V4) \ + "xor " CC_D1 ", " CC_D1 ", " CC_A1 "\n\t" \ + VXOR_VV(REG_V11, REG_V11, REG_V8) \ + "xor " CC_D2 ", " CC_D2 ", " CC_A2 "\n\t" \ + VSLL_VI(REG_V20, REG_V3, s) \ + "xor " CC_D3 ", " CC_D3 ", " CC_A3 "\n\t" \ + VSLL_VI(REG_V21, REG_V7, s) \ + "slli " CC_T0 ", " CC_D0 ", " #s "\n\t" \ + VSLL_VI(REG_V22, REG_V11, s) \ + "slli " CC_T1 ", " CC_D1 ", " #s "\n\t" \ + VSRL_VI(REG_V3, REG_V3, sr) \ + "slli " CC_T2 ", " CC_D2 ", " #s "\n\t" \ + VSRL_VI(REG_V7, REG_V7, sr) \ + "slli " CC_T3 ", " CC_D3 ", " #s "\n\t" \ + 
VSRL_VI(REG_V11, REG_V11, sr) \ + "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ + VOR_VV(REG_V3, REG_V3, REG_V20) \ + "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ + VOR_VV(REG_V7, REG_V7, REG_V21) \ + "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ + VOR_VV(REG_V11, REG_V11, REG_V22) \ + "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ + "or " CC_D0 ", " CC_D0 ", " CC_T0 "\n\t" \ + "or " CC_D1 ", " CC_D1 ", " CC_T1 "\n\t" \ + "or " CC_D2 ", " CC_D2 ", " CC_T2 "\n\t" \ + "or " CC_D3 ", " CC_D3 ", " CC_T3 "\n\t" + +#define PART_ROUND_ODD_CDB(s, sr) \ + "add " CC_C0 ", " CC_C0 ", " CC_D0 "\n\t" \ + VADD_VV(REG_V2, REG_V2, REG_V3) \ + "add " CC_C1 ", " CC_C1 ", " CC_D1 "\n\t" \ + VADD_VV(REG_V6, REG_V6, REG_V7) \ + "add " CC_C2 ", " CC_C2 ", " CC_D2 "\n\t" \ + VADD_VV(REG_V10, REG_V10, REG_V11) \ + "add " CC_C3 ", " CC_C3 ", " CC_D3 "\n\t" \ + VXOR_VV(REG_V1, REG_V1, REG_V2) \ + "xor " CC_B0 ", " CC_B0 ", " CC_C0 "\n\t" \ + VXOR_VV(REG_V5, REG_V5, REG_V6) \ + "xor " CC_B1 ", " CC_B1 ", " CC_C1 "\n\t" \ + VXOR_VV(REG_V9, REG_V9, REG_V10) \ + "xor " CC_B2 ", " CC_B2 ", " CC_C2 "\n\t" \ + VSLL_VI(REG_V20, REG_V1, s) \ + "xor " CC_B3 ", " CC_B3 ", " CC_C3 "\n\t" \ + VSLL_VI(REG_V21, REG_V5, s) \ + "slli " CC_T0 ", " CC_B0 ", " #s "\n\t" \ + VSLL_VI(REG_V22, REG_V9, s) \ + "slli " CC_T1 ", " CC_B1 ", " #s "\n\t" \ + VSRL_VI(REG_V1, REG_V1, sr) \ + "slli " CC_T2 ", " CC_B2 ", " #s "\n\t" \ + VSRL_VI(REG_V5, REG_V5, sr) \ + "slli " CC_T3 ", " CC_B3 ", " #s "\n\t" \ + VSRL_VI(REG_V9, REG_V9, sr) \ + "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ + VOR_VV(REG_V1, REG_V1, REG_V20) \ + "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ + VOR_VV(REG_V5, REG_V5, REG_V21) \ + "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ + VOR_VV(REG_V9, REG_V9, REG_V22) \ + "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ + "or " CC_B0 ", " CC_B0 ", " CC_T0 "\n\t" \ + "or " CC_B1 ", " CC_B1 ", " CC_T1 "\n\t" \ + "or " CC_B2 ", " CC_B2 ", " CC_T2 "\n\t" \ + "or " CC_B3 ", " CC_B3 ", " CC_T3 "\n\t" + +#define PART_ROUND_EVEN_ABD(s, sr) \ + "add " CC_A0 ", " CC_A0 ", " CC_B1 "\n\t" \ + VADD_VV(REG_V0, REG_V0, REG_V1) \ + "add " CC_A1 ", " CC_A1 ", " CC_B2 "\n\t" \ + VADD_VV(REG_V4, REG_V4, REG_V5) \ + "add " CC_A2 ", " CC_A2 ", " CC_B3 "\n\t" \ + VADD_VV(REG_V8, REG_V8, REG_V9) \ + "add " CC_A3 ", " CC_A3 ", " CC_B0 "\n\t" \ + VXOR_VV(REG_V3, REG_V3, REG_V0) \ + "xor " CC_D3 ", " CC_D3 ", " CC_A0 "\n\t" \ + VXOR_VV(REG_V7, REG_V7, REG_V4) \ + "xor " CC_D0 ", " CC_D0 ", " CC_A1 "\n\t" \ + VXOR_VV(REG_V11, REG_V11, REG_V8) \ + "xor " CC_D1 ", " CC_D1 ", " CC_A2 "\n\t" \ + VSLL_VI(REG_V20, REG_V3, s) \ + "xor " CC_D2 ", " CC_D2 ", " CC_A3 "\n\t" \ + VSLL_VI(REG_V21, REG_V7, s) \ + "slli " CC_T0 ", " CC_D3 ", " #s "\n\t" \ + VSLL_VI(REG_V22, REG_V11, s) \ + "slli " CC_T1 ", " CC_D0 ", " #s "\n\t" \ + VSRL_VI(REG_V3, REG_V3, sr) \ + "slli " CC_T2 ", " CC_D1 ", " #s "\n\t" \ + VSRL_VI(REG_V7, REG_V7, sr) \ + "slli " CC_T3 ", " CC_D2 ", " #s "\n\t" \ + VSRL_VI(REG_V11, REG_V11, sr) \ + "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ + VOR_VV(REG_V3, REG_V3, REG_V20) \ + "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ + VOR_VV(REG_V7, REG_V7, REG_V21) \ + "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ + VOR_VV(REG_V11, REG_V11, REG_V22) \ + "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ + "or " CC_D3 ", " CC_D3 ", " CC_T0 "\n\t" \ + "or " CC_D0 ", " CC_D0 ", " CC_T1 "\n\t" \ + "or " CC_D1 ", " CC_D1 ", " CC_T2 "\n\t" \ + "or " CC_D2 ", " CC_D2 ", " CC_T3 "\n\t" + +#define PART_ROUND_EVEN_CDB(s, sr) \ + "add " CC_C2 ", " CC_C2 ", " CC_D3 "\n\t" \ + VADD_VV(REG_V2, REG_V2, 
REG_V3) \ + "add " CC_C3 ", " CC_C3 ", " CC_D0 "\n\t" \ + VADD_VV(REG_V6, REG_V6, REG_V7) \ + "add " CC_C0 ", " CC_C0 ", " CC_D1 "\n\t" \ + VADD_VV(REG_V10, REG_V10, REG_V11) \ + "add " CC_C1 ", " CC_C1 ", " CC_D2 "\n\t" \ + VXOR_VV(REG_V1, REG_V1, REG_V2) \ + "xor " CC_B1 ", " CC_B1 ", " CC_C2 "\n\t" \ + VXOR_VV(REG_V5, REG_V5, REG_V6) \ + "xor " CC_B2 ", " CC_B2 ", " CC_C3 "\n\t" \ + VXOR_VV(REG_V9, REG_V9, REG_V10) \ + "xor " CC_B3 ", " CC_B3 ", " CC_C0 "\n\t" \ + VSLL_VI(REG_V20, REG_V1, s) \ + "xor " CC_B0 ", " CC_B0 ", " CC_C1 "\n\t" \ + VSLL_VI(REG_V21, REG_V5, s) \ + "slli " CC_T0 ", " CC_B1 ", " #s "\n\t" \ + VSLL_VI(REG_V22, REG_V9, s) \ + "slli " CC_T1 ", " CC_B2 ", " #s "\n\t" \ + VSRL_VI(REG_V1, REG_V1, sr) \ + "slli " CC_T2 ", " CC_B3 ", " #s "\n\t" \ + VSRL_VI(REG_V5, REG_V5, sr) \ + "slli " CC_T3 ", " CC_B0 ", " #s "\n\t" \ + VSRL_VI(REG_V9, REG_V9, sr) \ + "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ + VOR_VV(REG_V1, REG_V1, REG_V20) \ + "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ + VOR_VV(REG_V5, REG_V5, REG_V21) \ + "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ + VOR_VV(REG_V9, REG_V9, REG_V22) \ + "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ + "or " CC_B1 ", " CC_B1 ", " CC_T0 "\n\t" \ + "or " CC_B2 ", " CC_B2 ", " CC_T1 "\n\t" \ + "or " CC_B3 ", " CC_B3 ", " CC_T2 "\n\t" \ + "or " CC_B0 ", " CC_B0 ", " CC_T3 "\n\t" + +#elif !defined(WOLFSSL_RISCV_BASE_BIT_MANIPULATION ) + +#define PART_ROUND_ODD_ABD(s, sr) \ + "add " CC_A0 ", " CC_A0 ", " CC_B0 "\n\t" \ + VADD_VV(REG_V0, REG_V0, REG_V1) \ + "add " CC_A1 ", " CC_A1 ", " CC_B1 "\n\t" \ + VADD_VV(REG_V4, REG_V4, REG_V5) \ + "add " CC_A2 ", " CC_A2 ", " CC_B2 "\n\t" \ + VADD_VV(REG_V8, REG_V8, REG_V9) \ + "add " CC_A3 ", " CC_A3 ", " CC_B3 "\n\t" \ + VXOR_VV(REG_V3, REG_V3, REG_V0) \ + "xor " CC_D0 ", " CC_D0 ", " CC_A0 "\n\t" \ + VXOR_VV(REG_V7, REG_V7, REG_V4) \ + "xor " CC_D1 ", " CC_D1 ", " CC_A1 "\n\t" \ + VXOR_VV(REG_V11, REG_V11, REG_V8) \ + "xor " CC_D2 ", " CC_D2 ", " CC_A2 "\n\t" \ + VROR_VI(REG_V3, sr, REG_V3) \ + "xor " CC_D3 ", " CC_D3 ", " CC_A3 "\n\t" \ + VROR_VI(REG_V7, sr, REG_V7) \ + "slli " CC_T0 ", " CC_D0 ", " #s "\n\t" \ + VROR_VI(REG_V11, sr, REG_V11) \ + "slli " CC_T1 ", " CC_D1 ", " #s "\n\t" \ + "slli " CC_T2 ", " CC_D2 ", " #s "\n\t" \ + "slli " CC_T3 ", " CC_D3 ", " #s "\n\t" \ + "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ + "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ + "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ + "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ + "or " CC_D0 ", " CC_D0 ", " CC_T0 "\n\t" \ + "or " CC_D1 ", " CC_D1 ", " CC_T1 "\n\t" \ + "or " CC_D2 ", " CC_D2 ", " CC_T2 "\n\t" \ + "or " CC_D3 ", " CC_D3 ", " CC_T3 "\n\t" + +#define PART_ROUND_ODD_CDB(s, sr) \ + "add " CC_C0 ", " CC_C0 ", " CC_D0 "\n\t" \ + VADD_VV(REG_V2, REG_V2, REG_V3) \ + "add " CC_C1 ", " CC_C1 ", " CC_D1 "\n\t" \ + VADD_VV(REG_V6, REG_V6, REG_V7) \ + "add " CC_C2 ", " CC_C2 ", " CC_D2 "\n\t" \ + VADD_VV(REG_V10, REG_V10, REG_V11) \ + "add " CC_C3 ", " CC_C3 ", " CC_D3 "\n\t" \ + VXOR_VV(REG_V1, REG_V1, REG_V2) \ + "xor " CC_B0 ", " CC_B0 ", " CC_C0 "\n\t" \ + VXOR_VV(REG_V5, REG_V5, REG_V6) \ + "xor " CC_B1 ", " CC_B1 ", " CC_C1 "\n\t" \ + VXOR_VV(REG_V9, REG_V9, REG_V10) \ + "xor " CC_B2 ", " CC_B2 ", " CC_C2 "\n\t" \ + VROR_VI(REG_V1, sr, REG_V1) \ + "xor " CC_B3 ", " CC_B3 ", " CC_C3 "\n\t" \ + VROR_VI(REG_V5, sr, REG_V5) \ + "slli " CC_T0 ", " CC_B0 ", " #s "\n\t" \ + VROR_VI(REG_V9, sr, REG_V9) \ + "slli " CC_T1 ", " CC_B1 ", " #s "\n\t" \ + "slli " CC_T2 ", " CC_B2 ", " #s "\n\t" \ + "slli " CC_T3 ", " CC_B3 
", " #s "\n\t" \ + "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ + "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ + "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ + "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ + "or " CC_B0 ", " CC_B0 ", " CC_T0 "\n\t" \ + "or " CC_B1 ", " CC_B1 ", " CC_T1 "\n\t" \ + "or " CC_B2 ", " CC_B2 ", " CC_T2 "\n\t" \ + "or " CC_B3 ", " CC_B3 ", " CC_T3 "\n\t" + +#define PART_ROUND_EVEN_ABD(s, sr) \ + "add " CC_A0 ", " CC_A0 ", " CC_B1 "\n\t" \ + VADD_VV(REG_V0, REG_V0, REG_V1) \ + "add " CC_A1 ", " CC_A1 ", " CC_B2 "\n\t" \ + VADD_VV(REG_V4, REG_V4, REG_V5) \ + "add " CC_A2 ", " CC_A2 ", " CC_B3 "\n\t" \ + VADD_VV(REG_V8, REG_V8, REG_V9) \ + "add " CC_A3 ", " CC_A3 ", " CC_B0 "\n\t" \ + VXOR_VV(REG_V3, REG_V3, REG_V0) \ + "xor " CC_D3 ", " CC_D3 ", " CC_A0 "\n\t" \ + VXOR_VV(REG_V7, REG_V7, REG_V4) \ + "xor " CC_D0 ", " CC_D0 ", " CC_A1 "\n\t" \ + VXOR_VV(REG_V11, REG_V11, REG_V8) \ + "xor " CC_D1 ", " CC_D1 ", " CC_A2 "\n\t" \ + VROR_VI(REG_V3, sr, REG_V3) \ + "xor " CC_D2 ", " CC_D2 ", " CC_A3 "\n\t" \ + VROR_VI(REG_V7, sr, REG_V7) \ + "slli " CC_T0 ", " CC_D3 ", " #s "\n\t" \ + VROR_VI(REG_V11, sr, REG_V11) \ + "slli " CC_T1 ", " CC_D0 ", " #s "\n\t" \ + "slli " CC_T2 ", " CC_D1 ", " #s "\n\t" \ + "slli " CC_T3 ", " CC_D2 ", " #s "\n\t" \ + "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ + "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ + "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ + "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ + "or " CC_D3 ", " CC_D3 ", " CC_T0 "\n\t" \ + "or " CC_D0 ", " CC_D0 ", " CC_T1 "\n\t" \ + "or " CC_D1 ", " CC_D1 ", " CC_T2 "\n\t" \ + "or " CC_D2 ", " CC_D2 ", " CC_T3 "\n\t" + +#define PART_ROUND_EVEN_CDB(s, sr) \ + "add " CC_C2 ", " CC_C2 ", " CC_D3 "\n\t" \ + VADD_VV(REG_V2, REG_V2, REG_V3) \ + "add " CC_C3 ", " CC_C3 ", " CC_D0 "\n\t" \ + VADD_VV(REG_V6, REG_V6, REG_V7) \ + "add " CC_C0 ", " CC_C0 ", " CC_D1 "\n\t" \ + VADD_VV(REG_V10, REG_V10, REG_V11) \ + "add " CC_C1 ", " CC_C1 ", " CC_D2 "\n\t" \ + VXOR_VV(REG_V1, REG_V1, REG_V2) \ + "xor " CC_B1 ", " CC_B1 ", " CC_C2 "\n\t" \ + VXOR_VV(REG_V5, REG_V5, REG_V6) \ + "xor " CC_B2 ", " CC_B2 ", " CC_C3 "\n\t" \ + VXOR_VV(REG_V9, REG_V9, REG_V10) \ + "xor " CC_B3 ", " CC_B3 ", " CC_C0 "\n\t" \ + VROR_VI(REG_V1, sr, REG_V1) \ + "xor " CC_B0 ", " CC_B0 ", " CC_C1 "\n\t" \ + VROR_VI(REG_V5, sr, REG_V5) \ + "slli " CC_T0 ", " CC_B1 ", " #s "\n\t" \ + VROR_VI(REG_V9, sr, REG_V9) \ + "slli " CC_T1 ", " CC_B2 ", " #s "\n\t" \ + "slli " CC_T2 ", " CC_B3 ", " #s "\n\t" \ + "slli " CC_T3 ", " CC_B0 ", " #s "\n\t" \ + "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ + "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ + "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ + "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ + "or " CC_B1 ", " CC_B1 ", " CC_T0 "\n\t" \ + "or " CC_B2 ", " CC_B2 ", " CC_T1 "\n\t" \ + "or " CC_B3 ", " CC_B3 ", " CC_T2 "\n\t" \ + "or " CC_B0 ", " CC_B0 ", " CC_T3 "\n\t" + +#else + +#define PART_ROUND_ODD_ABD(s, sr) \ + "add " CC_A0 ", " CC_A0 ", " CC_B0 "\n\t" \ + VADD_VV(REG_V0, REG_V0, REG_V1) \ + "add " CC_A1 ", " CC_A1 ", " CC_B1 "\n\t" \ + VADD_VV(REG_V4, REG_V4, REG_V5) \ + "add " CC_A2 ", " CC_A2 ", " CC_B2 "\n\t" \ + VADD_VV(REG_V8, REG_V8, REG_V9) \ + "add " CC_A3 ", " CC_A3 ", " CC_B3 "\n\t" \ + VXOR_VV(REG_V3, REG_V3, REG_V0) \ + "xor " CC_D0 ", " CC_D0 ", " CC_A0 "\n\t" \ + VXOR_VV(REG_V7, REG_V7, REG_V4) \ + "xor " CC_D1 ", " CC_D1 ", " CC_A1 "\n\t" \ + VXOR_VV(REG_V11, REG_V11, REG_V8) \ + "xor " CC_D2 ", " CC_D2 ", " CC_A2 "\n\t" \ + VROR_VI(REG_V3, sr, REG_V3) \ + "xor " CC_D3 ", " CC_D3 ", " 
CC_A3 "\n\t" \ + VROR_VI(REG_V7, sr, REG_V7) \ + RORIW(REG_S6, REG_S6, sr) \ + VROR_VI(REG_V11, sr, REG_V11) \ + RORIW(REG_S7, REG_S7, sr) \ + RORIW(REG_S8, REG_S8, sr) \ + RORIW(REG_S9, REG_S9, sr) + +#define PART_ROUND_ODD_CDB(s, sr) \ + "add " CC_C0 ", " CC_C0 ", " CC_D0 "\n\t" \ + VADD_VV(REG_V2, REG_V2, REG_V3) \ + "add " CC_C1 ", " CC_C1 ", " CC_D1 "\n\t" \ + VADD_VV(REG_V6, REG_V6, REG_V7) \ + "add " CC_C2 ", " CC_C2 ", " CC_D2 "\n\t" \ + VADD_VV(REG_V10, REG_V10, REG_V11) \ + "add " CC_C3 ", " CC_C3 ", " CC_D3 "\n\t" \ + VXOR_VV(REG_V1, REG_V1, REG_V2) \ + "xor " CC_B0 ", " CC_B0 ", " CC_C0 "\n\t" \ + VXOR_VV(REG_V5, REG_V5, REG_V6) \ + "xor " CC_B1 ", " CC_B1 ", " CC_C1 "\n\t" \ + VXOR_VV(REG_V9, REG_V9, REG_V10) \ + "xor " CC_B2 ", " CC_B2 ", " CC_C2 "\n\t" \ + VROR_VI(REG_V1, sr, REG_V1) \ + "xor " CC_B3 ", " CC_B3 ", " CC_C3 "\n\t" \ + VROR_VI(REG_V5, sr, REG_V5) \ + RORIW(REG_T3, REG_T3, sr) \ + VROR_VI(REG_V9, sr, REG_V9) \ + RORIW(REG_T4, REG_T4, sr) \ + RORIW(REG_T5, REG_T5, sr) \ + RORIW(REG_T6, REG_T6, sr) + +#define PART_ROUND_EVEN_ABD(s, sr) \ + "add " CC_A0 ", " CC_A0 ", " CC_B1 "\n\t" \ + VADD_VV(REG_V0, REG_V0, REG_V1) \ + "add " CC_A1 ", " CC_A1 ", " CC_B2 "\n\t" \ + VADD_VV(REG_V4, REG_V4, REG_V5) \ + "add " CC_A2 ", " CC_A2 ", " CC_B3 "\n\t" \ + VADD_VV(REG_V8, REG_V8, REG_V9) \ + "add " CC_A3 ", " CC_A3 ", " CC_B0 "\n\t" \ + VXOR_VV(REG_V3, REG_V3, REG_V0) \ + "xor " CC_D3 ", " CC_D3 ", " CC_A0 "\n\t" \ + VXOR_VV(REG_V7, REG_V7, REG_V4) \ + "xor " CC_D0 ", " CC_D0 ", " CC_A1 "\n\t" \ + VXOR_VV(REG_V11, REG_V11, REG_V8) \ + "xor " CC_D1 ", " CC_D1 ", " CC_A2 "\n\t" \ + VROR_VI(REG_V3, sr, REG_V3) \ + "xor " CC_D2 ", " CC_D2 ", " CC_A3 "\n\t" \ + VROR_VI(REG_V7, sr, REG_V7) \ + RORIW(REG_S9, REG_S9, sr) \ + VROR_VI(REG_V11, sr, REG_V11) \ + RORIW(REG_S6, REG_S6, sr) \ + RORIW(REG_S7, REG_S7, sr) \ + RORIW(REG_S8, REG_S8, sr) + +#define PART_ROUND_EVEN_CDB(s, sr) \ + "add " CC_C2 ", " CC_C2 ", " CC_D3 "\n\t" \ + VADD_VV(REG_V2, REG_V2, REG_V3) \ + "add " CC_C3 ", " CC_C3 ", " CC_D0 "\n\t" \ + VADD_VV(REG_V6, REG_V6, REG_V7) \ + "add " CC_C0 ", " CC_C0 ", " CC_D1 "\n\t" \ + VADD_VV(REG_V10, REG_V10, REG_V11) \ + "add " CC_C1 ", " CC_C1 ", " CC_D2 "\n\t" \ + VXOR_VV(REG_V1, REG_V1, REG_V2) \ + "xor " CC_B1 ", " CC_B1 ", " CC_C2 "\n\t" \ + VXOR_VV(REG_V5, REG_V5, REG_V6) \ + "xor " CC_B2 ", " CC_B2 ", " CC_C3 "\n\t" \ + VXOR_VV(REG_V9, REG_V9, REG_V10) \ + "xor " CC_B3 ", " CC_B3 ", " CC_C0 "\n\t" \ + VROR_VI(REG_V1, sr, REG_V1) \ + "xor " CC_B0 ", " CC_B0 ", " CC_C1 "\n\t" \ + VROR_VI(REG_V5, sr, REG_V5) \ + "slli " CC_T0 ", " CC_B1 ", " #s "\n\t" \ + RORIW(REG_T4, REG_T4, sr) \ + VROR_VI(REG_V9, sr, REG_V9) \ + RORIW(REG_T5, REG_T5, sr) \ + RORIW(REG_T6, REG_T6, sr) \ + RORIW(REG_T3, REG_T3, sr) + +#endif + +#define QUARTER_ROUND_ODD_4() \ + /* a += b; d ^= a; d <<<= 16; */ \ + PART_ROUND_ODD_ABD(16, 16) \ + /* c += d; b ^= c; b <<<= 12; */ \ + PART_ROUND_ODD_CDB(12, 20) \ + /* a += b; d ^= a; d <<<= 8; */ \ + PART_ROUND_ODD_ABD( 8, 24) \ + /* c += d; b ^= c; b <<<= 7; */ \ + PART_ROUND_ODD_CDB( 7, 25) + +#define QUARTER_ROUND_EVEN_4() \ + /* a += b; d ^= a; d <<<= 16; */ \ + PART_ROUND_EVEN_ABD(16, 16) \ + /* c += d; b ^= c; b <<<= 12; */ \ + PART_ROUND_EVEN_CDB(12, 20) \ + /* a += b; d ^= a; d <<<= 8; */ \ + PART_ROUND_EVEN_ABD( 8, 24) \ + /* c += d; b ^= c; b <<<= 7; */ \ + PART_ROUND_EVEN_CDB( 7, 25) + +#define SHUFFLE_4(r, t, i) \ + VRGATHER_VV(t + 0, i, r + 0) \ + VRGATHER_VV(t + 1, i, r + 4) \ + VRGATHER_VV(t + 2, i, r + 8) \ + VMV_V_V(r + 0, t + 0) \ + 
VMV_V_V(r + 4, t + 1) \ + VMV_V_V(r + 8, t + 2) + +#define ODD_SHUFFLE_4() \ + /* a=0,1,2,3; b=4,5,6,7; c=8,9,10,11; d=12,13,14,15 \ + * => a=0,1,2,3; b=5,6,7,4; c=10,11,8,9; d=15,12,13,14 */ \ + SHUFFLE_4(REG_V3, REG_V20, REG_V25) \ + SHUFFLE_4(REG_V1, REG_V20, REG_V23) \ + SHUFFLE_4(REG_V2, REG_V20, REG_V24) + +#define EVEN_SHUFFLE_4() \ + /* a=0,1,2,3; b=5,6,7,4; c=10,11,8,9; d=15,12,13,14 \ + * => a=0,1,2,3; b=4,5,6,7; c=8,9,10,11; d=12,13,14,15 */ \ + SHUFFLE_4(REG_V3, REG_V20, REG_V23) \ + SHUFFLE_4(REG_V1, REG_V20, REG_V25) \ + SHUFFLE_4(REG_V2, REG_V20, REG_V24) + +/** + * Converts word into bytes with rotations having been done. + */ +static WC_INLINE int wc_chacha_encrypt_256(const word32* input, const byte* m, + byte* c) +{ + __asm__ __volatile__ ( + VSETIVLI(REG_X0, 4, 1, 1, 0b010, 0b000) + /* The layout of used vector registers is: + * v0-v3 - first block + * v4-v7 - second block + * v8-v11 - third block + * v12-v15 - message + * v16-v19 - input + * v20-v22 - temp + * v23-v25 - indeces for rotating words in vector + * + * v0 0 1 2 3 + * v1 4 5 6 7 + * v2 8 9 10 11 + * v3 12 13 14 15 + * load CHACHA state with indices placed as shown above + */ + + /* Load state to encrypt */ + "mv t2, %[input]\n\t" + VL4RE32_V(REG_V16, REG_T2) + VID_V(REG_V20) + VSLIDEDOWN_VI(REG_V23, REG_V20, 1) + VSLIDEUP_VI(REG_V23, REG_V20, 3) + VSLIDEDOWN_VI(REG_V24, REG_V20, 2) + VSLIDEUP_VI(REG_V24, REG_V20, 2) + VSLIDEDOWN_VI(REG_V25, REG_V20, 3) + VSLIDEUP_VI(REG_V25, REG_V20, 1) + /* Move state into regular registers */ + "ld a4, 0(%[input])\n\t" + "ld a6, 8(%[input])\n\t" + "ld t3, 16(%[input])\n\t" + "ld t5, 24(%[input])\n\t" + "ld s2, 32(%[input])\n\t" + "ld s4, 40(%[input])\n\t" + "ld s6, 48(%[input])\n\t" + "ld s8, 56(%[input])\n\t" + "srli a5, a4, 32\n\t" + "srli a7, a6, 32\n\t" + "srli t4, t3, 32\n\t" + "srli t6, t5, 32\n\t" + "srli s3, s2, 32\n\t" + "srli s5, s4, 32\n\t" + "srli s7, s6, 32\n\t" + "srli s9, s8, 32\n\t" + /* Move state into vector registers */ + VMVR_V(REG_V0, REG_V16, 4) + "addi t0, s6, 1\n\t" + VMVR_V(REG_V4, REG_V16, 4) + "addi t1, s6, 2\n\t" + VMVR_V(REG_V8, REG_V16, 4) + "addi s6, s6, 3\n\t" + /* Set counter word */ + VMV_S_X(REG_V7, REG_T0) + VMV_S_X(REG_V11, REG_T1) + /* Set number of odd+even rounds to perform */ + "li a3, 10\n\t" + "\n" + "L_chacha20_riscv_256_loop:\n\t" + /* Odd Round */ + QUARTER_ROUND_ODD_4() + ODD_SHUFFLE_4() + /* Even Round */ + QUARTER_ROUND_EVEN_4() + EVEN_SHUFFLE_4() + "addi a3, a3, -1\n\t" + "bnez a3, L_chacha20_riscv_256_loop\n\t" + /* Load message */ + "mv t2, %[m]\n\t" + VL4RE32_V(REG_V12, REG_T2) + "addi %[m], %[m], 64\n\t" + /* Add back state, XOR in message and store (load next block) */ + /* BLOCK 1 */ + VADD_VV(REG_V0, REG_V0, REG_V16) + VADD_VV(REG_V1, REG_V1, REG_V17) + VADD_VV(REG_V2, REG_V2, REG_V18) + VADD_VV(REG_V3, REG_V3, REG_V19) + VXOR_VV(REG_V0, REG_V0, REG_V12) + VXOR_VV(REG_V1, REG_V1, REG_V13) + VXOR_VV(REG_V2, REG_V2, REG_V14) + VXOR_VV(REG_V3, REG_V3, REG_V15) + "mv t2, %[m]\n\t" + VL4RE32_V(REG_V12, REG_T2) + "addi %[m], %[m], 64\n\t" + VMV_X_S(REG_T0, REG_V19) + "mv t2, %[c]\n\t" + VS4R_V(REG_V0, REG_T2) + "addi %[c], %[c], 64\n\t" + /* BLOCK 2 */ + "addi t0, t0, 1\n\t" + VMV_S_X(REG_V19, REG_T0) + VADD_VV(REG_V4, REG_V4, REG_V16) + VADD_VV(REG_V5, REG_V5, REG_V17) + VADD_VV(REG_V6, REG_V6, REG_V18) + VADD_VV(REG_V7, REG_V7, REG_V19) + VXOR_VV(REG_V4, REG_V4, REG_V12) + VXOR_VV(REG_V5, REG_V5, REG_V13) + VXOR_VV(REG_V6, REG_V6, REG_V14) + VXOR_VV(REG_V7, REG_V7, REG_V15) + "mv t2, %[m]\n\t" + VL4RE32_V(REG_V12, 
REG_T2) + "addi %[m], %[m], 64\n\t" + "mv t2, %[c]\n\t" + VS4R_V(REG_V4, REG_T2) + "addi %[c], %[c], 64\n\t" + /* BLOCK 3 */ + "addi t0, t0, 1\n\t" + VMV_S_X(REG_V19, REG_T0) + VADD_VV(REG_V8, REG_V8, REG_V16) + VADD_VV(REG_V9, REG_V9, REG_V17) + VADD_VV(REG_V10, REG_V10, REG_V18) + VADD_VV(REG_V11, REG_V11, REG_V19) + VXOR_VV(REG_V8, REG_V8, REG_V12) + VXOR_VV(REG_V9, REG_V9, REG_V13) + VXOR_VV(REG_V10, REG_V10, REG_V14) + VXOR_VV(REG_V11, REG_V11, REG_V15) + "mv t2, %[m]\n\t" + VL4RE32_V(REG_V12, REG_T2) + "mv t2, %[c]\n\t" + VS4R_V(REG_V8, REG_T2) + "addi %[c], %[c], 64\n\t" + /* BLOCK 4 */ + /* Move regular registers into vector registers for adding and xor */ + "addi t0, t0, 1\n\t" + VMV_S_X(REG_V0, REG_A4) + VMV_S_X(REG_V1, REG_T3) + VMV_S_X(REG_V2, REG_S2) + VMV_S_X(REG_V3, REG_S6) + VMV_S_X(REG_V4, REG_A5) + VMV_S_X(REG_V5, REG_T4) + VMV_S_X(REG_V6, REG_S3) + VMV_S_X(REG_V7, REG_S7) + VSLIDEUP_VI(REG_V0, REG_V4, 1) + VSLIDEUP_VI(REG_V1, REG_V5, 1) + VSLIDEUP_VI(REG_V2, REG_V6, 1) + VSLIDEUP_VI(REG_V3, REG_V7, 1) + VMV_S_X(REG_V4, REG_A6) + VMV_S_X(REG_V5, REG_T5) + VMV_S_X(REG_V6, REG_S4) + VMV_S_X(REG_V7, REG_S8) + VSLIDEUP_VI(REG_V0, REG_V4, 2) + VSLIDEUP_VI(REG_V1, REG_V5, 2) + VSLIDEUP_VI(REG_V2, REG_V6, 2) + VSLIDEUP_VI(REG_V3, REG_V7, 2) + VMV_S_X(REG_V4, REG_A7) + VMV_S_X(REG_V5, REG_T6) + VMV_S_X(REG_V6, REG_S5) + VMV_S_X(REG_V7, REG_S9) + VSLIDEUP_VI(REG_V0, REG_V4, 3) + VSLIDEUP_VI(REG_V1, REG_V5, 3) + VSLIDEUP_VI(REG_V2, REG_V6, 3) + VSLIDEUP_VI(REG_V3, REG_V7, 3) + VMV_S_X(REG_V19, REG_T0) + /* Add back state, XOR in message and store */ + VADD_VV(REG_V0, REG_V0, REG_V16) + VADD_VV(REG_V1, REG_V1, REG_V17) + VADD_VV(REG_V2, REG_V2, REG_V18) + VADD_VV(REG_V3, REG_V3, REG_V19) + VXOR_VV(REG_V0, REG_V0, REG_V12) + VXOR_VV(REG_V1, REG_V1, REG_V13) + VXOR_VV(REG_V2, REG_V2, REG_V14) + VXOR_VV(REG_V3, REG_V3, REG_V15) + "mv t2, %[c]\n\t" + VS4R_V(REG_V0, REG_T2) + : [m] "+r" (m), [c] "+r" (c) + : [input] "r" (input) + : "memory", "t0", "t1", "t2", "s1", "a3", + "t3", "t4", "t5", "t6", + "a4", "a5", "a6", "a7", + "s2", "s3", "s4", "s5", + "s6", "s7", "s8", "s9" + ); + return CHACHA_CHUNK_BYTES * 4; +} + +#ifndef WOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION + +#define PART_ROUND_2(a, b, d, t, a2, b2, d2, t2, sl, sr) \ + VADD_VV(a, a, b) \ + VADD_VV(a2, a2, b2) \ + VXOR_VV(d, d, a) \ + VXOR_VV(d2, d2, a2) \ + VSLL_VI(t, d, sl) \ + VSLL_VI(t2, d2, sl) \ + VSRL_VI(d, d, sr) \ + VSRL_VI(d2, d2, sr) \ + VOR_VV(d, d, t) \ + VOR_VV(d2, d2, t2) + +#else + +#define PART_ROUND_2(a, b, d, t, a2, b2, d2, t2, sl, sr) \ + VADD_VV(a, a, b) \ + VADD_VV(a2, a2, b2) \ + VXOR_VV(d, d, a) \ + VXOR_VV(d2, d2, a2) \ + VROR_VI(d, sr, d) \ + VROR_VI(d2, sr, d2) + +#endif + +#define QUARTER_ROUND_2(a, b, c, d, t, a2, b2, c2, d2, t2) \ + /* a += b; d ^= a; d <<<= 16; */ \ + PART_ROUND_2(a, b, d, t, a2, b2, d2, t2, 16, 16) \ + /* c += d; b ^= c; b <<<= 12; */ \ + PART_ROUND_2(c, d, b, t, c2, d2, b2, t2, 12, 20) \ + /* a += b; d ^= a; d <<<= 8; */ \ + PART_ROUND_2(a, b, d, t, a2, b2, d2, t2, 8, 24) \ + /* c += d; b ^= c; b <<<= 7; */ \ + PART_ROUND_2(c, d, b, t, c2, d2, b2, t2, 7, 25) + +#define ODD_SHUFFLE_2(b, c, d, t, b2, c2, d2, t2) \ + /* a=0,1,2,3; b=4,5,6,7; c=8,9,10,11; d=12,13,14,15 \ + * => a=0,1,2,3; b=5,6,7,4; c=10,11,8,9; d=15,12,13,14 */ \ + VRGATHER_VV(t, REG_V25, d) \ + VRGATHER_VV(t2, REG_V25, d2) \ + VMV_V_V(d, t) \ + VMV_V_V(d2, t2) \ + VRGATHER_VV(t, REG_V23, b) \ + VRGATHER_VV(t2, REG_V23, b2) \ + VMV_V_V(b, t) \ + VMV_V_V(b2, t2) \ + VRGATHER_VV(t, REG_V24, c) \ + VRGATHER_VV(t2, 
REG_V24, c2) \ + VMV_V_V(c, t) \ + VMV_V_V(c2, t2) + +#define EVEN_SHUFFLE_2(b, c, d, t, b2, c2, d2, t2) \ + /* a=0,1,2,3; b=5,6,7,4; c=10,11,8,9; d=15,12,13,14 \ + * => a=0,1,2,3; b=4,5,6,7; c=8,9,10,11; d=12,13,14,15 */ \ + VRGATHER_VV(t, REG_V23, d) \ + VRGATHER_VV(t2, REG_V23, d2) \ + VMV_V_V(d, t) \ + VMV_V_V(d2, t2) \ + VRGATHER_VV(t, REG_V25, b) \ + VRGATHER_VV(t2, REG_V25, b2) \ + VMV_V_V(b, t) \ + VMV_V_V(b2, t2) \ + VRGATHER_VV(t, REG_V24, c) \ + VRGATHER_VV(t2, REG_V24, c2) \ + VMV_V_V(c, t) \ + VMV_V_V(c2, t2) + + +static WC_INLINE int wc_chacha_encrypt_128(const word32* input, const byte* m, + byte* c) +{ + __asm__ __volatile__ ( + VSETIVLI(REG_X0, 4, 1, 1, 0b010, 0b000) + /* The layout of used vector registers is: + * v0-v3 - first block + * v4-v7 - second block + * v12-v15 - message + * v16-v19 - input + * v20-v22 - temp + * v23-v25 - indeces for rotating words in vector + * + * v0 0 1 2 3 + * v1 4 5 6 7 + * v2 8 9 10 11 + * v3 12 13 14 15 + * load CHACHA state with indices placed as shown above + */ + + /* Load incrementer register to modify counter */ + "mv t2, %[L_chacha20_vec_inc_first_word]\n\t" + VL1RE32_V(REG_V22, REG_T2) + VID_V(REG_V20) + VSLIDEDOWN_VI(REG_V23, REG_V20, 1) + VSLIDEUP_VI(REG_V23, REG_V20, 3) + VSLIDEDOWN_VI(REG_V24, REG_V20, 2) + VSLIDEUP_VI(REG_V24, REG_V20, 2) + VSLIDEDOWN_VI(REG_V25, REG_V20, 3) + VSLIDEUP_VI(REG_V25, REG_V20, 1) + /* Load state to encrypt */ + "mv t2, %[input]\n\t" + VL4RE32_V(REG_V16, REG_T2) + /* Load message */ + "mv t2, %[m]\n\t" + VL4RE32_V(REG_V12, REG_T2) + "addi %[m], %[m], 64\n\t" + /* Move state into vector registers */ + VMVR_V(REG_V0, REG_V16, 4) + VMVR_V(REG_V4, REG_V16, 4) + /* Add counter word */ + VADD_VV(REG_V7, REG_V7, REG_V22) + /* Set number of odd+even rounds to perform */ + "li t0, 10\n\t" + "\n" + "L_chacha20_riscv_128_loop:\n\t" + QUARTER_ROUND_2(REG_V0, REG_V1, REG_V2, REG_V3, REG_V20, + REG_V4, REG_V5, REG_V6, REG_V7, REG_V21) + ODD_SHUFFLE_2(REG_V1, REG_V2, REG_V3, REG_V20, + REG_V5, REG_V6, REG_V7, REG_V21) + QUARTER_ROUND_2(REG_V0, REG_V1, REG_V2, REG_V3, REG_V20, + REG_V4, REG_V5, REG_V6, REG_V7, REG_V21) + EVEN_SHUFFLE_2(REG_V1, REG_V2, REG_V3, REG_V20, + REG_V5, REG_V6, REG_V7, REG_V21) + "addi t0, t0, -1\n\t" + "bnez t0, L_chacha20_riscv_128_loop\n\t" + /* Add back state, XOR in message and store (load next block) */ + VADD_VV(REG_V0, REG_V0, REG_V16) + VADD_VV(REG_V1, REG_V1, REG_V17) + VADD_VV(REG_V2, REG_V2, REG_V18) + VADD_VV(REG_V3, REG_V3, REG_V19) + VXOR_VV(REG_V0, REG_V0, REG_V12) + VXOR_VV(REG_V1, REG_V1, REG_V13) + VXOR_VV(REG_V2, REG_V2, REG_V14) + VXOR_VV(REG_V3, REG_V3, REG_V15) + "mv t2, %[m]\n\t" + VL4RE32_V(REG_V12, REG_T2) + "mv t2, %[c]\n\t" + VS4R_V(REG_V0, REG_T2) + "addi %[c], %[c], 64\n\t" + VADD_VV(REG_V19, REG_V19, REG_V22) + VADD_VV(REG_V4, REG_V4, REG_V16) + VADD_VV(REG_V5, REG_V5, REG_V17) + VADD_VV(REG_V6, REG_V6, REG_V18) + VADD_VV(REG_V7, REG_V7, REG_V19) + VXOR_VV(REG_V4, REG_V4, REG_V12) + VXOR_VV(REG_V5, REG_V5, REG_V13) + VXOR_VV(REG_V6, REG_V6, REG_V14) + VXOR_VV(REG_V7, REG_V7, REG_V15) + "mv t2, %[c]\n\t" + VS4R_V(REG_V4, REG_T2) + : [m] "+r" (m), [c] "+r" (c) + : [input] "r" (input), + [L_chacha20_vec_inc_first_word] "r" (L_chacha20_vec_inc_first_word) + : "memory", "t0", "t1", "t2" + ); + return CHACHA_CHUNK_BYTES * 2; +} + +#ifndef WOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION + +#define PART_ROUND(a, b, d, t, sl, sr) \ + VADD_VV(a, a, b) \ + VXOR_VV(d, d, a) \ + VSLL_VI(t, d, sl) \ + VSRL_VI(d, d, sr) \ + VOR_VV(d, d, t) + +#else + +#define PART_ROUND(a, b, d, 
t, sl, sr) \ + VADD_VV(a, a, b) \ + VXOR_VV(d, d, a) \ + VROR_VI(d, sr, d) + +#endif + +#define QUARTER_ROUND(a, b, c, d, t) \ + /* a += b; d ^= a; d <<<= 16; */ \ + PART_ROUND(a, b, d, t, 16, 16) \ + /* c += d; b ^= c; b <<<= 12; */ \ + PART_ROUND(c, d, b, t, 12, 20) \ + /* a += b; d ^= a; d <<<= 8; */ \ + PART_ROUND(a, b, d, t, 8, 24) \ + /* c += d; b ^= c; b <<<= 7; */ \ + PART_ROUND(c, d, b, t, 7, 25) + +#define ODD_SHUFFLE(b, c, d, t) \ + /* a=0,1,2,3; b=4,5,6,7; c=8,9,10,11; d=12,13,14,15 \ + * => a=0,1,2,3; b=5,6,7,4; c=10,11,8,9; d=15,12,13,14 */ \ + VSLIDEDOWN_VI(t, d, 3) \ + VSLIDEUP_VI(t, d, 1) \ + VMV_V_V(d, t) \ + VSLIDEDOWN_VI(t, b, 1) \ + VSLIDEUP_VI(t, b, 3) \ + VMV_V_V(b, t) \ + VSLIDEDOWN_VI(t, c, 2) \ + VSLIDEUP_VI(t, c, 2) \ + VMV_V_V(c, t) + +#define EVEN_SHUFFLE(b, c, d, t) \ + /* a=0,1,2,3; b=5,6,7,4; c=10,11,8,9; d=15,12,13,14 \ + * => a=0,1,2,3; b=4,5,6,7; c=8,9,10,11; d=12,13,14,15 */ \ + VSLIDEDOWN_VI(t, d, 1) \ + VSLIDEUP_VI(t, d, 3) \ + VMV_V_V(d, t) \ + VSLIDEDOWN_VI(t, b, 3) \ + VSLIDEUP_VI(t, b, 1) \ + VMV_V_V(b, t) \ + VSLIDEDOWN_VI(t, c, 2) \ + VSLIDEUP_VI(t, c, 2) \ + VMV_V_V(c, t) + +#define EIGHT_QUARTER_ROUNDS(a, b, c, d, t) \ + /* Odd Round */ \ + QUARTER_ROUND(a, b, c, d, t) \ + ODD_SHUFFLE(b, c, d, t) \ + /* Even Round */ \ + QUARTER_ROUND(a, b, c, d, t) \ + EVEN_SHUFFLE(b, c, d, t) + +static WC_INLINE void wc_chacha_encrypt_64(const word32* input, const byte* m, + byte* c, word32 bytes, byte* over) +{ + word64 bytes64 = (word64)bytes; + + __asm__ __volatile__ ( + VSETIVLI(REG_X0, 4, 1, 1, 0b010, 0b000) + /* The layout of used vector registers is: + * v0-v3 - block + * v4-v7 - message + * v8-v11 - input + * v12 - temp + * + * v0 0 1 2 3 + * v1 4 5 6 7 + * v2 8 9 10 11 + * v3 12 13 14 15 + * load CHACHA state with indices placed as shown above + */ + + /* Load incrementer register to modify counter */ + "mv t2, %[L_chacha20_vec_inc_first_word]\n\t" + VL1RE32_V(REG_V13, REG_T2) + /* Load state to encrypt */ + "mv t2, %[input]\n\t" + VL4RE32_V(REG_V8, REG_T2) + "\n" + "L_chacha20_riscv_64_loop:\n\t" + /* Move state into vector registers */ + VMVR_V(REG_V0, REG_V8, 4) + /* Add counter word */ + /* Odd Round */ + EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) + EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) + EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) + EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) + EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) + EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) + EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) + EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) + EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) + EIGHT_QUARTER_ROUNDS(REG_V0, REG_V1, REG_V2, REG_V3, REG_V12) + /* Add back state */ + VADD_VV(REG_V0, REG_V0, REG_V8) + VADD_VV(REG_V1, REG_V1, REG_V9) + VADD_VV(REG_V2, REG_V2, REG_V10) + VADD_VV(REG_V3, REG_V3, REG_V11) + "addi t2, %[bytes], -64\n\t" + "bltz t2, L_chacha20_riscv_64_lt_64\n\t" + "mv t2, %[m]\n\t" + VL4RE32_V(REG_V4, REG_T2) + VXOR_VV(REG_V4, REG_V4, REG_V0) + VXOR_VV(REG_V5, REG_V5, REG_V1) + VXOR_VV(REG_V6, REG_V6, REG_V2) + VXOR_VV(REG_V7, REG_V7, REG_V3) + "mv t2, %[c]\n\t" + VS4R_V(REG_V4, REG_T2) + "addi %[c], %[c], 64\n\t" + "addi %[m], %[m], 64\n\t" + "addi %[bytes], %[bytes], -64\n\t" + VADD_VV(REG_V11, REG_V11, REG_V13) + "bnez %[bytes], L_chacha20_riscv_64_loop\n\t" + "beqz %[bytes], L_chacha20_riscv_64_done\n\t" + "\n" + 
"L_chacha20_riscv_64_lt_64:\n\t" + "mv t2, %[over]\n\t" + VS4R_V(REG_V0, REG_T2) + + "addi t2, %[bytes], -32\n\t" + "bltz t2, L_chacha20_riscv_64_lt_32\n\t" + "mv t2, %[m]\n\t" + VL2RE32_V(REG_V4, REG_T2) + VXOR_VV(REG_V4, REG_V4, REG_V0) + VXOR_VV(REG_V5, REG_V5, REG_V1) + "mv t2, %[c]\n\t" + VS2R_V(REG_V4, REG_T2) + "addi %[c], %[c], 32\n\t" + "addi %[m], %[m], 32\n\t" + "addi %[bytes], %[bytes], -32\n\t" + "beqz %[bytes], L_chacha20_riscv_64_done\n\t" + VMVR_V(REG_V0, REG_V2, 2) + "\n" + "L_chacha20_riscv_64_lt_32:\n\t" + "addi t2, %[bytes], -16\n\t" + "bltz t2, L_chacha20_riscv_64_lt_16\n\t" + "mv t2, %[m]\n\t" + VL1RE32_V(REG_V4, REG_T2) + VXOR_VV(REG_V4, REG_V4, REG_V0) + "mv t2, %[c]\n\t" + VS1R_V(REG_V4, REG_T2) + "addi %[c], %[c], 16\n\t" + "addi %[m], %[m], 16\n\t" + "addi %[bytes], %[bytes], -16\n\t" + "beqz %[bytes], L_chacha20_riscv_64_done\n\t" + VMV_V_V(REG_V0, REG_V1) + "\n" + "L_chacha20_riscv_64_lt_16:\n\t" + "addi t2, %[bytes], -8\n\t" + "bltz t2, L_chacha20_riscv_64_lt_8\n\t" + VSETIVLI(REG_X0, 2, 1, 1, 0b011, 0b000) + VMV_X_S(REG_T0, REG_V0) + VSETIVLI(REG_X0, 4, 1, 1, 0b010, 0b000) + "ld t1, (%[m])\n\t" + "xor t1, t1, t0\n\t" + "sd t1, (%[c])\n\t" + "addi %[c], %[c], 8\n\t" + "addi %[m], %[m], 8\n\t" + "addi %[bytes], %[bytes], -8\n\t" + "beqz %[bytes], L_chacha20_riscv_64_done\n\t" + VSLIDEDOWN_VI(REG_V0, REG_V0, 2) + "\n" + "L_chacha20_riscv_64_lt_8:\n\t" + VSETIVLI(REG_X0, 2, 1, 1, 0b011, 0b000) + VMV_X_S(REG_T0, REG_V0) + VSETIVLI(REG_X0, 4, 1, 1, 0b010, 0b000) + "addi %[bytes], %[bytes], -1\n\t" + "\n" + "L_chacha20_riscv_64_loop_lt_8:\n\t" + "lb t1, (%[m])\n\t" + "addi %[m], %[m], 1\n\t" + "xor t1, t1, t0\n\t" + "sb t1, (%[c])\n\t" + "addi %[c], %[c], 1\n\t" + "addi %[bytes], %[bytes], -1\n\t" + "srli t0, t0, 8\n\t" + "bgez %[bytes], L_chacha20_riscv_64_loop_lt_8\n\t" + "\n" + "L_chacha20_riscv_64_done:\n\t" + : [m] "+r" (m), [c] "+r" (c), [bytes] "+r" (bytes64) + : [input] "r" (input), [over] "r" (over), + [L_chacha20_vec_inc_first_word] "r" (L_chacha20_vec_inc_first_word) + : "memory", "t0", "t1", "t2" + ); +} + +/** + * Encrypt a stream of bytes + */ +static void wc_chacha_encrypt_bytes(ChaCha* ctx, const byte* m, byte* c, + word32 bytes) +{ + int processed; + + if (bytes >= CHACHA_CHUNK_BYTES * 6) { + processed = (bytes / (CHACHA_CHUNK_BYTES * 6)) * CHACHA_CHUNK_BYTES * 6; + wc_chacha_encrypt_384(ctx->X, m, c, processed); + + bytes -= processed; + c += processed; + m += processed; + ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], + processed / CHACHA_CHUNK_BYTES); + } + if (bytes >= CHACHA_CHUNK_BYTES * 4) { + processed = wc_chacha_encrypt_256(ctx->X, m, c); + + bytes -= processed; + c += processed; + m += processed; + ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], + processed / CHACHA_CHUNK_BYTES); + } + if (bytes >= CHACHA_CHUNK_BYTES * 2) { + processed = wc_chacha_encrypt_128(ctx->X, m, c); + + bytes -= processed; + c += processed; + m += processed; + ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], + processed / CHACHA_CHUNK_BYTES); + } + if (bytes > 0) { + wc_chacha_encrypt_64(ctx->X, m, c, bytes, (byte*)ctx->over); + if (bytes > CHACHA_CHUNK_BYTES) + ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]); + ctx->left = CHACHA_CHUNK_BYTES - (bytes & (CHACHA_CHUNK_BYTES - 1)); + ctx->left &= CHACHA_CHUNK_BYTES - 1; + ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]); + } +} + +#else + +#if !defined(WOLFSSL_RISCV_BIT_MANIPULATION) + +#define PART_ROUND_ODD_ABD(sl, sr) \ + "add " CC_A0 ", " CC_A0 ", " CC_B0 "\n\t" \ + 
"add " CC_A1 ", " CC_A1 ", " CC_B1 "\n\t" \ + "add " CC_A2 ", " CC_A2 ", " CC_B2 "\n\t" \ + "add " CC_A3 ", " CC_A3 ", " CC_B3 "\n\t" \ + "xor " CC_D0 ", " CC_D0 ", " CC_A0 "\n\t" \ + "xor " CC_D1 ", " CC_D1 ", " CC_A1 "\n\t" \ + "xor " CC_D2 ", " CC_D2 ", " CC_A2 "\n\t" \ + "xor " CC_D3 ", " CC_D3 ", " CC_A3 "\n\t" \ + "slli " CC_T0 ", " CC_D0 ", " #sl "\n\t" \ + "slli " CC_T1 ", " CC_D1 ", " #sl "\n\t" \ + "slli " CC_T2 ", " CC_D2 ", " #sl "\n\t" \ + "slli " CC_T3 ", " CC_D3 ", " #sl "\n\t" \ + "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ + "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ + "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ + "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ + "or " CC_D0 ", " CC_D0 ", " CC_T0 "\n\t" \ + "or " CC_D1 ", " CC_D1 ", " CC_T1 "\n\t" \ + "or " CC_D2 ", " CC_D2 ", " CC_T2 "\n\t" \ + "or " CC_D3 ", " CC_D3 ", " CC_T3 "\n\t" + +#define PART_ROUND_ODD_CDB(sl, sr) \ + "add " CC_C0 ", " CC_C0 ", " CC_D0 "\n\t" \ + "add " CC_C1 ", " CC_C1 ", " CC_D1 "\n\t" \ + "add " CC_C2 ", " CC_C2 ", " CC_D2 "\n\t" \ + "add " CC_C3 ", " CC_C3 ", " CC_D3 "\n\t" \ + "xor " CC_B0 ", " CC_B0 ", " CC_C0 "\n\t" \ + "xor " CC_B1 ", " CC_B1 ", " CC_C1 "\n\t" \ + "xor " CC_B2 ", " CC_B2 ", " CC_C2 "\n\t" \ + "xor " CC_B3 ", " CC_B3 ", " CC_C3 "\n\t" \ + "slli " CC_T0 ", " CC_B0 ", " #sl "\n\t" \ + "slli " CC_T1 ", " CC_B1 ", " #sl "\n\t" \ + "slli " CC_T2 ", " CC_B2 ", " #sl "\n\t" \ + "slli " CC_T3 ", " CC_B3 ", " #sl "\n\t" \ + "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ + "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ + "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ + "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ + "or " CC_B0 ", " CC_B0 ", " CC_T0 "\n\t" \ + "or " CC_B1 ", " CC_B1 ", " CC_T1 "\n\t" \ + "or " CC_B2 ", " CC_B2 ", " CC_T2 "\n\t" \ + "or " CC_B3 ", " CC_B3 ", " CC_T3 "\n\t" + +#define PART_ROUND_EVEN_ABD(sl, sr) \ + "add " CC_A0 ", " CC_A0 ", " CC_B1 "\n\t" \ + "add " CC_A1 ", " CC_A1 ", " CC_B2 "\n\t" \ + "add " CC_A2 ", " CC_A2 ", " CC_B3 "\n\t" \ + "add " CC_A3 ", " CC_A3 ", " CC_B0 "\n\t" \ + "xor " CC_D3 ", " CC_D3 ", " CC_A0 "\n\t" \ + "xor " CC_D0 ", " CC_D0 ", " CC_A1 "\n\t" \ + "xor " CC_D1 ", " CC_D1 ", " CC_A2 "\n\t" \ + "xor " CC_D2 ", " CC_D2 ", " CC_A3 "\n\t" \ + "slli " CC_T0 ", " CC_D3 ", " #sl "\n\t" \ + "slli " CC_T1 ", " CC_D0 ", " #sl "\n\t" \ + "slli " CC_T2 ", " CC_D1 ", " #sl "\n\t" \ + "slli " CC_T3 ", " CC_D2 ", " #sl "\n\t" \ + "srliw " CC_D3 ", " CC_D3 ", " #sr "\n\t" \ + "srliw " CC_D0 ", " CC_D0 ", " #sr "\n\t" \ + "srliw " CC_D1 ", " CC_D1 ", " #sr "\n\t" \ + "srliw " CC_D2 ", " CC_D2 ", " #sr "\n\t" \ + "or " CC_D3 ", " CC_D3 ", " CC_T0 "\n\t" \ + "or " CC_D0 ", " CC_D0 ", " CC_T1 "\n\t" \ + "or " CC_D1 ", " CC_D1 ", " CC_T2 "\n\t" \ + "or " CC_D2 ", " CC_D2 ", " CC_T3 "\n\t" + +#define PART_ROUND_EVEN_CDB(sl, sr) \ + "add " CC_C2 ", " CC_C2 ", " CC_D3 "\n\t" \ + "add " CC_C3 ", " CC_C3 ", " CC_D0 "\n\t" \ + "add " CC_C0 ", " CC_C0 ", " CC_D1 "\n\t" \ + "add " CC_C1 ", " CC_C1 ", " CC_D2 "\n\t" \ + "xor " CC_B1 ", " CC_B1 ", " CC_C2 "\n\t" \ + "xor " CC_B2 ", " CC_B2 ", " CC_C3 "\n\t" \ + "xor " CC_B3 ", " CC_B3 ", " CC_C0 "\n\t" \ + "xor " CC_B0 ", " CC_B0 ", " CC_C1 "\n\t" \ + "slli " CC_T0 ", " CC_B1 ", " #sl "\n\t" \ + "slli " CC_T1 ", " CC_B2 ", " #sl "\n\t" \ + "slli " CC_T2 ", " CC_B3 ", " #sl "\n\t" \ + "slli " CC_T3 ", " CC_B0 ", " #sl "\n\t" \ + "srliw " CC_B1 ", " CC_B1 ", " #sr "\n\t" \ + "srliw " CC_B2 ", " CC_B2 ", " #sr "\n\t" \ + "srliw " CC_B3 ", " CC_B3 ", " #sr "\n\t" \ + "srliw " CC_B0 ", " CC_B0 ", " #sr "\n\t" \ + "or " CC_B1 ", " 
CC_B1 ", " CC_T0 "\n\t" \ + "or " CC_B2 ", " CC_B2 ", " CC_T1 "\n\t" \ + "or " CC_B3 ", " CC_B3 ", " CC_T2 "\n\t" \ + "or " CC_B0 ", " CC_B0 ", " CC_T3 "\n\t" + +#else + +#define PART_ROUND_ODD_ABD(sl, sr) \ + "add " CC_A0 ", " CC_A0 ", " CC_B0 "\n\t" \ + "add " CC_A1 ", " CC_A1 ", " CC_B1 "\n\t" \ + "add " CC_A2 ", " CC_A2 ", " CC_B2 "\n\t" \ + "add " CC_A3 ", " CC_A3 ", " CC_B3 "\n\t" \ + "xor " CC_D0 ", " CC_D0 ", " CC_A0 "\n\t" \ + "xor " CC_D1 ", " CC_D1 ", " CC_A1 "\n\t" \ + "xor " CC_D2 ", " CC_D2 ", " CC_A2 "\n\t" \ + "xor " CC_D3 ", " CC_D3 ", " CC_A3 "\n\t" \ + RORIW(REG_S6, REG_S6, sr) \ + RORIW(REG_S7, REG_S7, sr) \ + RORIW(REG_S8, REG_S8, sr) \ + RORIW(REG_S9, REG_S9, sr) + +#define PART_ROUND_ODD_CDB(sl, sr) \ + "add " CC_C0 ", " CC_C0 ", " CC_D0 "\n\t" \ + "add " CC_C1 ", " CC_C1 ", " CC_D1 "\n\t" \ + "add " CC_C2 ", " CC_C2 ", " CC_D2 "\n\t" \ + "add " CC_C3 ", " CC_C3 ", " CC_D3 "\n\t" \ + "xor " CC_B0 ", " CC_B0 ", " CC_C0 "\n\t" \ + "xor " CC_B1 ", " CC_B1 ", " CC_C1 "\n\t" \ + "xor " CC_B2 ", " CC_B2 ", " CC_C2 "\n\t" \ + "xor " CC_B3 ", " CC_B3 ", " CC_C3 "\n\t" \ + RORIW(REG_T3, REG_T3, sr) \ + RORIW(REG_T4, REG_T4, sr) \ + RORIW(REG_T5, REG_T5, sr) \ + RORIW(REG_T6, REG_T6, sr) + +#define PART_ROUND_EVEN_ABD(sl, sr) \ + "add " CC_A0 ", " CC_A0 ", " CC_B1 "\n\t" \ + "add " CC_A1 ", " CC_A1 ", " CC_B2 "\n\t" \ + "add " CC_A2 ", " CC_A2 ", " CC_B3 "\n\t" \ + "add " CC_A3 ", " CC_A3 ", " CC_B0 "\n\t" \ + "xor " CC_D3 ", " CC_D3 ", " CC_A0 "\n\t" \ + "xor " CC_D0 ", " CC_D0 ", " CC_A1 "\n\t" \ + "xor " CC_D1 ", " CC_D1 ", " CC_A2 "\n\t" \ + "xor " CC_D2 ", " CC_D2 ", " CC_A3 "\n\t" \ + RORIW(REG_S9, REG_S9, sr) \ + RORIW(REG_S6, REG_S6, sr) \ + RORIW(REG_S7, REG_S7, sr) \ + RORIW(REG_S8, REG_S8, sr) + +#define PART_ROUND_EVEN_CDB(sl, sr) \ + "add " CC_C2 ", " CC_C2 ", " CC_D3 "\n\t" \ + "add " CC_C3 ", " CC_C3 ", " CC_D0 "\n\t" \ + "add " CC_C0 ", " CC_C0 ", " CC_D1 "\n\t" \ + "add " CC_C1 ", " CC_C1 ", " CC_D2 "\n\t" \ + "xor " CC_B1 ", " CC_B1 ", " CC_C2 "\n\t" \ + "xor " CC_B2 ", " CC_B2 ", " CC_C3 "\n\t" \ + "xor " CC_B3 ", " CC_B3 ", " CC_C0 "\n\t" \ + "xor " CC_B0 ", " CC_B0 ", " CC_C1 "\n\t" \ + RORIW(REG_T4, REG_T4, sr) \ + RORIW(REG_T5, REG_T5, sr) \ + RORIW(REG_T6, REG_T6, sr) \ + RORIW(REG_T3, REG_T3, sr) + +#endif + +#define QUARTER_ROUND_ODD() \ + /* a += b; d ^= a; d <<<= 16; */ \ + PART_ROUND_ODD_ABD(16, 16) \ + /* c += d; b ^= c; b <<<= 12; */ \ + PART_ROUND_ODD_CDB(12, 20) \ + /* a += b; d ^= a; d <<<= 8; */ \ + PART_ROUND_ODD_ABD( 8, 24) \ + /* c += d; b ^= c; b <<<= 7; */ \ + PART_ROUND_ODD_CDB( 7, 25) + +#define QUARTER_ROUND_EVEN() \ + /* a += b; d ^= a; d <<<= 16; */ \ + PART_ROUND_EVEN_ABD(16, 16) \ + /* c += d; b ^= c; b <<<= 12; */ \ + PART_ROUND_EVEN_CDB(12, 20) \ + /* a += b; d ^= a; d <<<= 8; */ \ + PART_ROUND_EVEN_ABD( 8, 24) \ + /* c += d; b ^= c; b <<<= 7; */ \ + PART_ROUND_EVEN_CDB( 7, 25) + + +static WC_INLINE void wc_chacha_encrypt(const word32* input, const byte* m, + byte* c, word32 bytes, word32* over) +{ + word64 bytes64 = (word64)bytes; + + __asm__ __volatile__ ( + "L_chacha20_riscv_outer:\n\t" + /* Move state into regular registers */ + "ld a4, 0(%[input])\n\t" + "ld a6, 8(%[input])\n\t" + "ld t3, 16(%[input])\n\t" + "ld t5, 24(%[input])\n\t" + "ld s2, 32(%[input])\n\t" + "ld s4, 40(%[input])\n\t" + "ld s6, 48(%[input])\n\t" + "ld s8, 56(%[input])\n\t" + "srli a5, a4, 32\n\t" + "srli a7, a6, 32\n\t" + "srli t4, t3, 32\n\t" + "srli t6, t5, 32\n\t" + "srli s3, s2, 32\n\t" + "srli s5, s4, 32\n\t" + "srli s7, s6, 32\n\t" + "srli s9, s8, 
32\n\t" + + /* Set number of odd+even rounds to perform */ + "li a3, 10\n\t" + "\n" + "L_chacha20_riscv_loop:\n\t" + /* Odd Round */ + QUARTER_ROUND_ODD() + /* Even Round */ + QUARTER_ROUND_EVEN() + "addi a3, a3, -1\n\t" + "bnez a3, L_chacha20_riscv_loop\n\t" + + "ld t0, 0(%[input])\n\t" + "ld t1, 8(%[input])\n\t" + "ld t2, 16(%[input])\n\t" + "ld s1, 24(%[input])\n\t" + "add a4, a4, t0\n\t" + "add a6, a6, t1\n\t" + "add t3, t3, t2\n\t" + "add t5, t5, s1\n\t" + "srli t0, t0, 32\n\t" + "srli t1, t1, 32\n\t" + "srli t2, t2, 32\n\t" + "srli s1, s1, 32\n\t" + "add a5, a5, t0\n\t" + "add a7, a7, t1\n\t" + "add t4, t4, t2\n\t" + "add t6, t6, s1\n\t" + "ld t0, 32(%[input])\n\t" + "ld t1, 40(%[input])\n\t" + "ld t2, 48(%[input])\n\t" + "ld s1, 56(%[input])\n\t" + "add s2, s2, t0\n\t" + "add s4, s4, t1\n\t" + "add s6, s6, t2\n\t" + "add s8, s8, s1\n\t" + "srli t0, t0, 32\n\t" + "srli t1, t1, 32\n\t" + "srli t2, t2, 32\n\t" + "srli s1, s1, 32\n\t" + "add s3, s3, t0\n\t" + "add s5, s5, t1\n\t" + "add s7, s7, t2\n\t" + "add s9, s9, s1\n\t" + + "addi %[bytes], %[bytes], -64\n\t" + "bgez %[bytes], L_chacha20_riscv_xor\n\t" + "addi a3, %[bytes], 64\n\t" + + "sw a4, 0(%[over])\n\t" + "sw a5, 4(%[over])\n\t" + "sw a6, 8(%[over])\n\t" + "sw a7, 12(%[over])\n\t" + "sw t3, 16(%[over])\n\t" + "sw t4, 20(%[over])\n\t" + "sw t5, 24(%[over])\n\t" + "sw t6, 28(%[over])\n\t" + "sw s2, 32(%[over])\n\t" + "sw s3, 36(%[over])\n\t" + "sw s4, 40(%[over])\n\t" + "sw s5, 44(%[over])\n\t" + "sw s6, 48(%[over])\n\t" + "sw s7, 52(%[over])\n\t" + "sw s8, 56(%[over])\n\t" + "sw s9, 60(%[over])\n\t" + + "addi a3, a3, -1\n\t" + "L_chacha20_riscv_byte_loop:\n\t" + "lb t0, (%[m])\n\t" + "lb t1, (%[over])\n\t" + "xor t0, t0, t1\n\t" + "sb t0, (%[c])\n\t" + "addi %[m], %[m], 1\n\t" + "addi %[c], %[c], 1\n\t" + "addi %[over], %[over], 1\n\t" + "addi a3, a3, -1\n\t" + "bgez a3, L_chacha20_riscv_byte_loop\n\t" + + "lw t0, 48(%[input])\n\t" + "addi t0, t0, 1\n\t" + "sw t0, 48(%[input])\n\t" + "bltz %[bytes], L_chacha20_riscv_done\n\t" + + "L_chacha20_riscv_xor:\n\t" +#if !defined(WOLFSSL_RISCV_BIT_MANIPULATION) + "ld t0, 0(%[m])\n\t" + "ld t1, 8(%[m])\n\t" + "ld t2, 16(%[m])\n\t" + "ld s1, 24(%[m])\n\t" + "xor a4, a4, t0\n\t" + "xor a6, a6, t1\n\t" + "xor t3, t3, t2\n\t" + "xor t5, t5, s1\n\t" + "srli t0, t0, 32\n\t" + "srli t1, t1, 32\n\t" + "srli t2, t2, 32\n\t" + "srli s1, s1, 32\n\t" + "xor a5, a5, t0\n\t" + "xor a7, a7, t1\n\t" + "xor t4, t4, t2\n\t" + "xor t6, t6, s1\n\t" + "ld t0, 32(%[m])\n\t" + "ld t1, 40(%[m])\n\t" + "ld t2, 48(%[m])\n\t" + "ld s1, 56(%[m])\n\t" + "xor s2, s2, t0\n\t" + "xor s4, s4, t1\n\t" + "xor s6, s6, t2\n\t" + "xor s8, s8, s1\n\t" + "srli t0, t0, 32\n\t" + "srli t1, t1, 32\n\t" + "srli t2, t2, 32\n\t" + "srli s1, s1, 32\n\t" + "xor s3, s3, t0\n\t" + "xor s5, s5, t1\n\t" + "xor s7, s7, t2\n\t" + "xor s9, s9, s1\n\t" + "sw a4, 0(%[c])\n\t" + "sw a5, 4(%[c])\n\t" + "sw a6, 8(%[c])\n\t" + "sw a7, 12(%[c])\n\t" + "sw t3, 16(%[c])\n\t" + "sw t4, 20(%[c])\n\t" + "sw t5, 24(%[c])\n\t" + "sw t6, 28(%[c])\n\t" + "sw s2, 32(%[c])\n\t" + "sw s3, 36(%[c])\n\t" + "sw s4, 40(%[c])\n\t" + "sw s5, 44(%[c])\n\t" + "sw s6, 48(%[c])\n\t" + "sw s7, 52(%[c])\n\t" + "sw s8, 56(%[c])\n\t" + "sw s9, 60(%[c])\n\t" +#else + PACK(REG_A4, REG_A4, REG_A5) + PACK(REG_A6, REG_A6, REG_A7) + PACK(REG_T3, REG_T3, REG_T4) + PACK(REG_T5, REG_T5, REG_T6) + PACK(REG_S2, REG_S2, REG_S3) + PACK(REG_S4, REG_S4, REG_S5) + PACK(REG_S6, REG_S6, REG_S7) + PACK(REG_S8, REG_S8, REG_S9) + "ld a5, 0(%[m])\n\t" + "ld a7, 8(%[m])\n\t" + "ld t4, 
16(%[m])\n\t" + "ld t6, 24(%[m])\n\t" + "ld s3, 32(%[m])\n\t" + "ld s5, 40(%[m])\n\t" + "ld s7, 48(%[m])\n\t" + "ld s9, 56(%[m])\n\t" + "xor a4, a4, a5\n\t" + "xor a6, a6, a7\n\t" + "xor t3, t3, t4\n\t" + "xor t5, t5, t6\n\t" + "xor s2, s2, s3\n\t" + "xor s4, s4, s5\n\t" + "xor s6, s6, s7\n\t" + "xor s8, s8, s9\n\t" + "sd a4, 0(%[c])\n\t" + "sd a6, 8(%[c])\n\t" + "sd t3, 16(%[c])\n\t" + "sd t5, 24(%[c])\n\t" + "sd s2, 32(%[c])\n\t" + "sd s4, 40(%[c])\n\t" + "sd s6, 48(%[c])\n\t" + "sd s8, 56(%[c])\n\t" +#endif + + "lw t0, 48(%[input])\n\t" + "addi %[m], %[m], 64\n\t" + "addi t0, t0, 1\n\t" + "addi %[c], %[c], 64\n\t" + "sw t0, 48(%[input])\n\t" + + "bnez %[bytes], L_chacha20_riscv_outer\n\t" + + "L_chacha20_riscv_done:\n\t" + : [m] "+r" (m), [c] "+r" (c), [bytes] "+r" (bytes64), [over] "+r" (over) + : [input] "r" (input) + : "memory", "t0", "t1", "t2", "s1", "a3", + "t3", "t4", "t5", "t6", + "a4", "a5", "a6", "a7", + "s2", "s3", "s4", "s5", + "s6", "s7", "s8", "s9" + ); +} + +/** + * Encrypt a stream of bytes + */ +static void wc_chacha_encrypt_bytes(ChaCha* ctx, const byte* m, byte* c, + word32 bytes) +{ + wc_chacha_encrypt(ctx->X, m, c, bytes, ctx->over); + ctx->left = CHACHA_CHUNK_BYTES - (bytes & (CHACHA_CHUNK_BYTES - 1)); + ctx->left &= CHACHA_CHUNK_BYTES - 1; +} +#endif + +/** + * API to encrypt/decrypt a message of any size. + */ +int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input, + word32 msglen) +{ + int ret = 0; + + if ((ctx == NULL) || (output == NULL) || (input == NULL)) { + ret = BAD_FUNC_ARG; + } + else { + /* handle left overs */ + if (msglen > 0 && ctx->left > 0) { + byte* out; + word32 i; + + out = (byte*)ctx->over + CHACHA_CHUNK_BYTES - ctx->left; + for (i = 0; i < msglen && i < ctx->left; i++) { + output[i] = (byte)(input[i] ^ out[i]); + } + ctx->left -= i; + + msglen -= i; + output += i; + input += i; + } + + if (msglen != 0) { + wc_chacha_encrypt_bytes(ctx, input, output, msglen); + } + } + + return ret; +} + +#endif /* HAVE_CHACHA */ +#endif /* WOLFSSL_ARMASM && !WOLFSSL_ARMASM_NO_NEON */ diff --git a/wolfcrypt/src/port/riscv/riscv-64-sha256.c b/wolfcrypt/src/port/riscv/riscv-64-sha256.c index 62d26745e1..3c546b00d7 100644 --- a/wolfcrypt/src/port/riscv/riscv-64-sha256.c +++ b/wolfcrypt/src/port/riscv/riscv-64-sha256.c @@ -846,41 +846,41 @@ static WC_INLINE void Sha256Final(wc_Sha256* sha256, byte* hash) #elif defined(WOLFSSL_RISCV_BASE_BIT_MANIPULATION) "ld t1, 0(%[digest])\n\t" "ld t3, 8(%[digest])\n\t" - "ld s1, 16(%[digest])\n\t" - "ld s3, 24(%[digest])\n\t" + "ld a5, 16(%[digest])\n\t" + "ld a7, 24(%[digest])\n\t" REV8(REG_T1, REG_T1) REV8(REG_T3, REG_T3) - REV8(REG_S1, REG_S1) - REV8(REG_S3, REG_S3) + REV8(REG_A5, REG_A5) + REV8(REG_A7, REG_A7) "srli t0, t1, 32\n\t" "srli t2, t3, 32\n\t" - "srli s0, s1, 32\n\t" - "srli s2, s3, 32\n\t" + "srli a4, a5, 32\n\t" + "srli a6, a7, 32\n\t" "sw t0, 0(%[hash])\n\t" "sw t1, 4(%[hash])\n\t" "sw t2, 8(%[hash])\n\t" "sw t3, 12(%[hash])\n\t" - "sw s0, 16(%[hash])\n\t" - "sw s1, 20(%[hash])\n\t" - "sw s2, 24(%[hash])\n\t" - "sw s3, 28(%[hash])\n\t" + "sw a4, 16(%[hash])\n\t" + "sw a5, 20(%[hash])\n\t" + "sw a6, 24(%[hash])\n\t" + "sw a7, 28(%[hash])\n\t" #else LOAD_WORD_REV(t0, 0, %[digest], t2, t3, t4) LOAD_WORD_REV(t1, 4, %[digest], t2, t3, t4) - LOAD_WORD_REV(s0, 8, %[digest], t2, t3, t4) - LOAD_WORD_REV(s1, 12, %[digest], t2, t3, t4) + LOAD_WORD_REV(a4, 8, %[digest], t2, t3, t4) + LOAD_WORD_REV(a5, 12, %[digest], t2, t3, t4) "sw t0, 0(%[hash])\n\t" "sw t1, 4(%[hash])\n\t" - "sw s0, 8(%[hash])\n\t" - "sw s1, 
12(%[hash])\n\t" + "sw a4, 8(%[hash])\n\t" + "sw a5, 12(%[hash])\n\t" LOAD_WORD_REV(t0, 16, %[digest], t2, t3, t4) LOAD_WORD_REV(t1, 20, %[digest], t2, t3, t4) - LOAD_WORD_REV(s0, 24, %[digest], t2, t3, t4) - LOAD_WORD_REV(s1, 28, %[digest], t2, t3, t4) + LOAD_WORD_REV(a4, 24, %[digest], t2, t3, t4) + LOAD_WORD_REV(a5, 28, %[digest], t2, t3, t4) "sw t0, 16(%[hash])\n\t" "sw t1, 20(%[hash])\n\t" - "sw s0, 24(%[hash])\n\t" - "sw s1, 28(%[hash])\n\t" + "sw a4, 24(%[hash])\n\t" + "sw a5, 28(%[hash])\n\t" #endif : : [digest] "r" (sha256->digest), [hash] "r" (hash) @@ -889,7 +889,7 @@ static WC_INLINE void Sha256Final(wc_Sha256* sha256, byte* hash) , [rev_idx] "r" (rev_idx) #endif : "cc", "memory", "t0", "t1", "t2", "t3", "t4", "t5", "t6", - "s0", "s1", "s2", "s3" + "a4", "a5", "a6", "a7" ); } diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index 7c357e6818..a0720ca6c3 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -7789,10 +7789,10 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t chacha_test(void) return WC_TEST_RET_ENC_EC(ret); if (XMEMCMP(plain_big, input_big, CHACHA_BIG_TEST_SIZE)) - return WC_TEST_RET_ENC_NC; + return WC_TEST_RET_ENC_I(i); if (XMEMCMP(cipher_big, cipher_big_result, CHACHA_BIG_TEST_SIZE)) - return WC_TEST_RET_ENC_NC; + return WC_TEST_RET_ENC_I(i); } #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_NO_MALLOC) diff --git a/wolfssl/wolfcrypt/chacha.h b/wolfssl/wolfcrypt/chacha.h index a430224e05..c3af0507af 100644 --- a/wolfssl/wolfcrypt/chacha.h +++ b/wolfssl/wolfcrypt/chacha.h @@ -82,7 +82,8 @@ typedef struct ChaCha { byte extra[12]; #endif word32 left; /* number of bytes leftover */ -#if defined(USE_INTEL_CHACHA_SPEEDUP) || defined(WOLFSSL_ARMASM) +#if defined(USE_INTEL_CHACHA_SPEEDUP) || defined(WOLFSSL_ARMASM) || \ + defined(WOLFSSL_RISCV_ASM) word32 over[CHACHA_CHUNK_WORDS]; #endif } ChaCha; diff --git a/wolfssl/wolfcrypt/port/riscv/riscv-64-asm.h b/wolfssl/wolfcrypt/port/riscv/riscv-64-asm.h index 5407654ee9..e9d200f916 100644 --- a/wolfssl/wolfcrypt/port/riscv/riscv-64-asm.h +++ b/wolfssl/wolfcrypt/port/riscv/riscv-64-asm.h @@ -137,6 +137,12 @@ (0b0010011 << 0) | \ (rs << 15) | (rd << 7)) +#define RORIW(rd, rs, imm) \ + ASM_WORD((0b0110000 << 25) | (0b101 << 12) | \ + (0b0011011 << 0) | \ + (imm << 20) | (rs << 15) | (rd << 7)) + + /* rd = rs1[0..31] | rs2[0..31]. */ #define PACK(rd, rs1, rs2) \ ASM_WORD((0b0000100 << 25) | (0b100 << 12) | 0b0110011 | \ @@ -184,16 +190,36 @@ /* Move from vector register to vector registor. */ #define VMV_V_V(vd, vs1) \ ASM_WORD((0b1010111 << 0) | (0b000 << 12) | (0b1 << 25) | \ - (0b010111 << 26) | (vd << 7) | (vs1 << 15)) + (0b010111 << 26) | ((vd) << 7) | ((vs1) << 15)) /* Splat register to each component of the vector registor. */ #define VMV_V_X(vd, rs1) \ ASM_WORD((0b1010111 << 0) | (0b100 << 12) | (0b1 << 25) | \ - (0b010111 << 26) | (vd << 7) | (rs1 << 15)) + (0b010111 << 26) | ((vd) << 7) | ((rs1) << 15)) +/* Splat immediate to each component of the vector registor. */ +#define VMV_V_I(vd, imm) \ + ASM_WORD((0b1010111 << 0) | (0b011 << 12) | (0b1 << 25) | \ + (0b010111 << 26) | ((vd) << 7) | ((imm) << 15)) /* Move n vector registers to vector registers. 
*/ #define VMVR_V(vd, vs2, n) \ ASM_WORD((0b1010111 << 0) | (0b011 << 12) | (0b1 << 25) | \ - (0b100111 << 26) | (vd << 7) | ((n-1) << 15) | \ - (vs2 << 20)) + (0b100111 << 26) | ((vd) << 7) | ((n-1) << 15) | \ + ((vs2) << 20)) + + +/* + * Logic + */ + +/* vd = vs2 << uimm */ +#define VSLL_VI(vd, vs2, uimm) \ + ASM_WORD((0b100101 << 26) | (0b1 << 25) | \ + (0b011 << 12) | (0b1010111 << 0) | \ + (vd << 7) | (uimm << 15) | (vs2 << 20)) +/* vd = vs2 >> uimm */ +#define VSRL_VI(vd, vs2, uimm) \ + ASM_WORD((0b101000 << 26) | (0b1 << 25) | \ + (0b011 << 12) | (0b1010111 << 0) | \ + (vd << 7) | (uimm << 15) | (vs2 << 20)) /* @@ -235,13 +261,13 @@ #define VMV_X_S(rd, vs2) \ ASM_WORD((0b010000 << 26) | (0b1 << 25) | \ (0b010 << 12) | (0b1010111 << 0) | \ - (rd << 7) | (vs2 << 20)) + ((rd) << 7) | ((vs2) << 20)) /* vd[0] = x[rs1] */ #define VMV_S_X(vd, rs1) \ ASM_WORD((0b010000 << 26) | (0b1 << 25) | \ (0b110 << 12) | (0b1010111 << 0) | \ - (vd << 7) | (rs1 << 15)) + ((vd) << 7) | ((rs1) << 15)) /* vd[shift..max] = vs2[0..max-shift] * Sliding up doesn't change bottom part of destination. @@ -249,7 +275,7 @@ #define VSLIDEUP_VI(vd, vs2, shift) \ ASM_WORD((0b001110 << 26) | (0b1 << 25) | \ (0b011 << 12) | (0b1010111 << 0) | \ - (vd << 7) | (shift << 15) | (vs2 << 20)) + ((vd) << 7) | ((shift) << 15) | ((vs2) << 20)) /* vd[0..max-shift] = vs2[shift..max] * Sliding down change top part of destination. @@ -257,13 +283,18 @@ #define VSLIDEDOWN_VI(vd, vs2, shift) \ ASM_WORD((0b001111 << 26) | (0b1 << 25) | \ (0b011 << 12) | (0b1010111 << 0) | \ - (vd << 7) | (shift << 15) | (vs2 << 20)) + ((vd) << 7) | ((shift) << 15) | ((vs2) << 20)) /* vd[i] = vs1[vs2[i]] */ #define VRGATHER_VV(vd, vs1, vs2) \ ASM_WORD((0b001100 << 26) | (0b1 << 25) | \ (0b000 << 12) | (0b1010111 << 0) | \ - (vd << 7) | (vs1 << 15) | (vs2 << 20)) + ((vd) << 7) | ((vs1) << 15) | ((vs2) << 20)) + +#define VID_V(vd) \ + ASM_WORD((0b010100 << 26) | (0b1 << 25) | (0b00000 << 20) | \ + (0b10001 << 15) | (0b010 << 12) | \ + (0b1010111 << 0) | ((vd) << 7)) /* @@ -281,15 +312,22 @@ defined(WOLFSSL_RISCV_VECTOR_CRYPTO_ASM) /* - * Bit Manipulation + * Vector Bit Manipulation */ /* Reverse order of bytes in words of vector regsiter. */ #define VREV8(vd, vs2) \ ASM_WORD((0b010010 << 26) | (0b1 << 25) | (0b01001<< 15) | \ - (0b010 << 12) | (0b1010111 << 0) | \ + (0b010 << 12) | (0b1010111 << 0) | \ (vs2 << 20) | (vd << 7)) +/* Reverse order of bytes in words of vector regsiter. */ +#define VROR_VI(vd, imm, vs2) \ + ASM_WORD((0b01010 << 27) | (0b1 << 25) | (0b011 << 12) | \ + (0b1010111 << 0) | ((imm >> 5) << 26) | \ + (vs2 << 20) | ((imm & 0x1f) << 15) | (vd << 7)) + + #endif /* WOLFSSL_RISCV_VECTOR_BASE_BIT_MANIPULATION || * WOLFSSL_RISCV_VECTOR_CRYPTO_ASM */
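Notes on the ChaCha20 quarter round used above: the PART_ROUND macros add, XOR and then rotate a 32-bit word. When the Zvbb/Zbb rotate instructions (VROR_VI, roriw) are not available, the rotation is emulated with a shift pair plus an OR, which is why every PART_ROUND invocation passes a left-shift/right-shift pair with sl + sr == 32 (16/16, 12/20, 8/24, 7/25). The following is a minimal plain-C reference of the same quarter round, given only to make the rotation identity explicit; the helper names are illustrative and not part of wolfSSL.

#include <stdint.h>

/* rotl32() is what VSLL_VI + VSRL_VI + VOR_VV (vector) or slli + srliw + or
 * (scalar) compute when no rotate instruction is available; valid for
 * 0 < n < 32. */
static inline uint32_t rotl32(uint32_t x, unsigned n)
{
    return (x << n) | (x >> (32u - n));
}

/* One ChaCha20 quarter round over four 32-bit state words. */
static void chacha_quarter_round(uint32_t *a, uint32_t *b, uint32_t *c,
                                 uint32_t *d)
{
    *a += *b; *d ^= *a; *d = rotl32(*d, 16);
    *c += *d; *b ^= *c; *b = rotl32(*b, 12);
    *a += *b; *d ^= *a; *d = rotl32(*d, 8);
    *c += *d; *b ^= *c; *b = rotl32(*b, 7);
}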
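The ODD_SHUFFLE/EVEN_SHUFFLE macros convert the 4x4 state between the column-round and diagonal-round layouts: before a diagonal round, row b is rotated left by one lane, row c by two and row d by three so that each diagonal sits in a single vector lane, and the even shuffle rotates the rows back. The one-block code does this with VSLIDEDOWN_VI/VSLIDEUP_VI pairs, while the two-block *_2 variants use VRGATHER_VV with the index vectors in v23-v25 built once from VID_V. A small sketch of the lane rotation being performed (the helper is illustrative only and assumes the standard row-major 4x4 ChaCha state):

/* Rotate one 4-word row of the ChaCha state left by n lanes (0 <= n <= 3). */
static void rotate_row_left(uint32_t row[4], unsigned n)
{
    uint32_t tmp[4];
    unsigned i;

    for (i = 0; i < 4; i++)
        tmp[i] = row[(i + n) & 3u];
    for (i = 0; i < 4; i++)
        row[i] = tmp[i];
}

/* Column -> diagonal: rotate_row_left(b, 1); rotate_row_left(c, 2);
 * rotate_row_left(d, 3); the even shuffle undoes this with 3, 2, 1. */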
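Partial blocks: when fewer than 64 bytes remain, wc_chacha_encrypt_64() first stores the whole keystream block into ctx->over and then XORs only the bytes that are needed, so the next wc_Chacha_Process() call can consume the buffered keystream via ctx->left before generating a new block. The two-line update of ctx->left in wc_chacha_encrypt_bytes() simply counts the unused keystream bytes of that final block; a hedged restatement, assuming CHACHA_CHUNK_BYTES is 64 as elsewhere in wolfSSL:

/* Number of keystream bytes left buffered after encrypting `bytes` bytes. */
static uint32_t chacha_leftover(uint32_t bytes)
{
    uint32_t left = 64u - (bytes & 63u);  /* CHACHA_CHUNK_BYTES == 64 */
    return left & 63u;                    /* 0 when bytes is a multiple of 64 */
}

/* Example: chacha_leftover(100) == 28, i.e. 28 bytes of ctx->over remain
 * available for the start of the next message. */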