diff --git a/doc/migrate-4.rst b/doc/migrate-4.rst
index c6000dee0..4e0d67cfe 100644
--- a/doc/migrate-4.rst
+++ b/doc/migrate-4.rst
@@ -107,7 +107,7 @@ autotools                              meson
 ``--without-posix-semaphores``         ``posix_semaphores=disabled``
 ``--without-pthread_setaffinity_np``   ``pthread_setaffinity_np=disabled``
 ``--without-fmv``                      ``fmv=disabled``
-``--without-movntdq``                  ``movntdq=disabled``
+``--without-movntdq``                  ``sse2_stream=disabled``
 ``--without-cuda``                     ``cuda=disabled``
 ``--without-gdrapi``                   ``gdrapi=disabled``
 ====================================== =====================================
diff --git a/include/spead2/common_defines.h b/include/spead2/common_defines.h
index bd35ad223..8d9a977a7 100644
--- a/include/spead2/common_defines.h
+++ b/include/spead2/common_defines.h
@@ -34,12 +34,6 @@
 #define SPEAD2_MAX_LOG_LEVEL (spead2::log_level::info)
 #endif
 
-#if SPEAD2_USE_FMV
-# define SPEAD2_FMV_TARGET(x) [[gnu::target(x)]]
-#else
-# define SPEAD2_FMV_TARGET(x)
-#endif
-
 /**
  * SPEAD protocol sending and receiving. All SPEAD-64-* flavours are
  * supported.
diff --git a/include/spead2/common_features.h.in b/include/spead2/common_features.h.in
index 0cf104afd..89242bfa7 100644
--- a/include/spead2/common_features.h.in
+++ b/include/spead2/common_features.h.in
@@ -38,25 +38,33 @@
 /* Python on MacOS likes to build universal binaries, which causes problems
  * because it doesn't match the compilation environment detected at
  * configuration time. So features that we only use on x86 are only enabled
- * if x86 is actually detected at build time.
+ * if x86 is actually detected at build time, and if function
+ * multi-versioning is available.
  */
 #if defined(__i386__) || defined(__i386) || defined(__x86_64__) || defined(__x86_64)
 
 # define SPEAD2_USE_FMV @SPEAD2_USE_FMV@
-/* On i386, MOVNTDQ is not guaranteed to exist at runtime, and we need function
- * multi-versioning to make it safe to use.
- */
-# if SPEAD2_USE_FMV || defined(__x86_64__) || defined(__x86_64)
-#  define SPEAD2_USE_MOVNTDQ @SPEAD2_USE_MOVNTDQ@
-# else
-#  define SPEAD2_USE_MOVNTDQ 0
+# if SPEAD2_USE_FMV
+#  define SPEAD2_USE_SSE2_STREAM @SPEAD2_USE_SSE2_STREAM@
+#  define SPEAD2_USE_AVX_STREAM @SPEAD2_USE_AVX_STREAM@
+#  define SPEAD2_USE_AVX512_STREAM @SPEAD2_USE_AVX512_STREAM@
+# elif defined(__x86_64__) || defined(__x86_64)
+#  define SPEAD2_USE_SSE2_STREAM @SPEAD2_USE_SSE2_STREAM@  /* Guaranteed to exist on x86_64 */
 # endif
 
-#else  // not x86
+#endif
 
+#ifndef SPEAD2_USE_FMV
 # define SPEAD2_USE_FMV 0
-# define SPEAD2_USE_MOVNTDQ 0
-
+#endif
+#ifndef SPEAD2_USE_SSE2_STREAM
+# define SPEAD2_USE_SSE2_STREAM 0
+#endif
+#ifndef SPEAD2_USE_AVX_STREAM
+# define SPEAD2_USE_AVX_STREAM 0
+#endif
+#ifndef SPEAD2_USE_AVX512_STREAM
+# define SPEAD2_USE_AVX512_STREAM 0
 #endif
 
 #define SPEAD2_USE_POSIX_SEMAPHORES @SPEAD2_USE_POSIX_SEMAPHORES@
diff --git a/include/spead2/common_memcpy.h b/include/spead2/common_memcpy.h
index a522d7448..e44f97fdd 100644
--- a/include/spead2/common_memcpy.h
+++ b/include/spead2/common_memcpy.h
@@ -31,15 +31,7 @@
 namespace spead2
 {
 
-#if SPEAD2_USE_FMV || !SPEAD2_USE_MOVNTDQ
-SPEAD2_FMV_TARGET("default")
 void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
-#endif
-
-#if SPEAD2_USE_MOVNTDQ
-SPEAD2_FMV_TARGET("sse2")
-void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
-#endif
 
 } // namespace spead2
 
diff --git a/meson.build b/meson.build
index 518af5a28..13ff12eac 100644
--- a/meson.build
+++ b/meson.build
@@ -191,26 +191,49 @@ use_pthread_setaffinity_np = get_option('pthread_setaffinity_np').require(
     dependencies : thread_dep
   )
 ).allowed()
-use_fmv = get_option('fmv').require(
+use_fmv = get_option('fmv').require(compiler.has_function_attribute('ifunc')).allowed()
+# has_function doesn't work for _mm_stream_si128 and friends, because they
+# are inline-only functions in GCC without external definitions.
+use_sse2_stream = get_option('sse2_stream').require(
   compiler.compiles(
-    '__attribute__((target("default"))) void foo() {}',
-    args : '-Werror',  # TODO: use werror kwarg once meson 1.3 is in use
-    name : 'function multi-versioning'
+    '''
+    #include <emmintrin.h>
+
+    [[gnu::target("sse2")]]
+    void foo()
+    {
+        _mm_stream_si128((__m128i *) NULL, __m128i());
+    }
+    ''',
+    name : 'SSE2 streaming intrinsic'
   )
 ).allowed()
-# has_function doesn't work for _mm_stream_si128, because it is an
-# inline-only function in GCC without an external definition.
-use_movntdq = get_option('movntdq').require(
+use_avx_stream = get_option('avx_stream').require(
   compiler.compiles(
     '''
-    #include <emmintrin.h>
+    #include <immintrin.h>
 
+    [[gnu::target("avx")]]
     void foo()
     {
-        _mm_stream_si128((__m128i *) NULL, __m128i());
+        _mm256_stream_si256((__m256i *) NULL, __m256i());
+    }
+    ''',
+    name : 'AVX streaming intrinsic'
+  )
+).allowed()
+use_avx512_stream = get_option('avx512_stream').require(
+  compiler.compiles(
+    '''
+    #include <immintrin.h>
+
+    [[gnu::target("avx512f")]]
+    void foo()
+    {
+        _mm512_stream_si512((__m512i *) NULL, __m512i());
     }
     ''',
-    name : 'MOVNTDQ intrinsic'
+    name : 'AVX-512 streaming intrinsic'
   )
 ).allowed()
 
@@ -230,7 +253,9 @@ conf.set10('SPEAD2_USE_EVENTFD', use_eventfd)
 conf.set10('SPEAD2_USE_POSIX_SEMAPHORES', use_posix_semaphores)
 conf.set10('SPEAD2_USE_PTHREAD_SETAFFINITY_NP', use_pthread_setaffinity_np)
 conf.set10('SPEAD2_USE_FMV', use_fmv)
-conf.set10('SPEAD2_USE_MOVNTDQ', use_movntdq)
+conf.set10('SPEAD2_USE_SSE2_STREAM', use_sse2_stream)
+conf.set10('SPEAD2_USE_AVX_STREAM', use_avx_stream)
+conf.set10('SPEAD2_USE_AVX512_STREAM', use_avx512_stream)
 conf.set10('SPEAD2_USE_PCAP', pcap_dep.found())
 conf.set('SPEAD2_MAX_LOG_LEVEL', '(spead2::log_level::' + get_option('max_log_level') + ')')
 
diff --git a/meson.options b/meson.options
index a4ada16e0..65187a277 100644
--- a/meson.options
+++ b/meson.options
@@ -25,7 +25,9 @@ option('eventfd', type : 'feature', description : 'Use eventfd system call for s
 option('posix_semaphores', type : 'feature', description : 'Use POSIX semaphores')
 option('pthread_setaffinity_np', type : 'feature', description : 'Use pthread_setaffinity_np to set thread affinity')
 option('fmv', type : 'feature', description : 'Use function multi-versioning')
-option('movntdq', type : 'feature', description : 'Use MOVNTDQ instruction for non-temporal stores')
+option('sse2_stream', type : 'feature', description : 'Use SSE2 for non-temporal stores')
+option('avx_stream', type : 'feature', description : 'Use AVX for non-temporal stores')
+option('avx512_stream', type : 'feature', description : 'Use AVX-512 for non-temporal stores')
 option('cuda', type : 'feature', description : 'Build CUDA examples')
 option('gdrapi', type : 'feature', description : 'Build gdrcopy examples')
 option('unit_test', type : 'feature', description : 'Build the unit tests')
diff --git a/src/common_memcpy.cpp b/src/common_memcpy.cpp
index 860ce43ec..59db889ce 100644
--- a/src/common_memcpy.cpp
+++ b/src/common_memcpy.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2016, 2020 National Research Foundation (SARAO)
+/* Copyright 2016, 2020, 2023 National Research Foundation (SARAO)
  *
  * This program is free software: you can redistribute it and/or modify it under
  * the terms of the GNU Lesser General Public License as published by the Free
@@ -17,70 +17,94 @@
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
+#include <utility>
 #include <spead2/common_defines.h>
 #include <spead2/common_features.h>
 #include <spead2/common_memcpy.h>
-#if SPEAD2_USE_MOVNTDQ
+
+#if SPEAD2_USE_SSE2_STREAM
 # include <emmintrin.h>
+# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_sse2
+# define SPEAD2_MEMCPY_TARGET "sse2"
+# define SPEAD2_MEMCPY_TYPE __m128i
+# define SPEAD2_MEMCPY_LOAD _mm_loadu_si128
+# define SPEAD2_MEMCPY_STORE _mm_stream_si128
+# define SPEAD2_MEMCPY_UNROLL 16
+# include "common_memcpy_impl.h"
+#endif
+
+#if SPEAD2_USE_AVX_STREAM
+# include <immintrin.h>
+# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_avx
+# define SPEAD2_MEMCPY_TARGET "avx"
+# define SPEAD2_MEMCPY_TYPE __m256i
+# define SPEAD2_MEMCPY_LOAD _mm256_loadu_si256
+# define SPEAD2_MEMCPY_STORE _mm256_stream_si256
+# define SPEAD2_MEMCPY_UNROLL 8
+# include "common_memcpy_impl.h"
+#endif
+
+#if SPEAD2_USE_AVX512_STREAM
+# include <immintrin.h>
+# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_avx512
+# define SPEAD2_MEMCPY_TARGET "avx512f"
+# define SPEAD2_MEMCPY_TYPE __m512i
+# define SPEAD2_MEMCPY_LOAD _mm512_loadu_si512
+# define SPEAD2_MEMCPY_STORE _mm512_stream_si512
+# define SPEAD2_MEMCPY_UNROLL 8
+# include "common_memcpy_impl.h"
 #endif
 
 namespace spead2
 {
 
-#if SPEAD2_USE_FMV || !SPEAD2_USE_MOVNTDQ
-SPEAD2_FMV_TARGET("default")
-void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
+#if SPEAD2_USE_FMV
+
+extern "C"
+{
+
+static void *(*resolve_memcpy_nontemporal())(void *, const void *, std::size_t)
 {
-    return std::memcpy(dest, src, n);
+    __builtin_cpu_init();
+    /* On Skylake server, AVX-512 reduces clock speeds. Use the logic as Glibc to
+     * decide whether AVX-512 is okay: it's okay if either AVX512ER or
+     * AVX512-VNNI is present. Glibc only applies that logic to Intel CPUs, but
+     * AMD introduced AVX-512 with Zen 4 which also supports AVX512-VNNI (and
+     * performs well), so we don't need to distinguish.
+     */
+#if SPEAD2_USE_AVX512_STREAM
+    if (__builtin_cpu_supports("avx512f")
+        && (__builtin_cpu_supports("avx512er") || __builtin_cpu_supports("avx512vnni")))
+        return memcpy_nontemporal_avx512;
+#endif
+#if SPEAD2_USE_AVX_STREAM
+    if (__builtin_cpu_supports("avx"))
+        return memcpy_nontemporal_avx;
+#endif
+#if SPEAD2_USE_SSE2_STREAM
+    if (__builtin_cpu_supports("sse2"))
+        return memcpy_nontemporal_sse2;
+#endif
+    return std::memcpy;
 }
-#endif // SPEAD2_USE_FMV || !SPEAD2_USE_MOVNTDQ
 
-#if SPEAD2_USE_MOVNTDQ
-SPEAD2_FMV_TARGET("sse2")
-void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
+} // extern "C"
+
+[[gnu::ifunc("resolve_memcpy_nontemporal")]]
+void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
+
+#else
+
+void memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
 {
-    char * __restrict__ dest_c = (char *) dest;
-    const char * __restrict__ src_c = (const char *) src;
-    // Align the destination to a cache-line boundary
-    std::uintptr_t dest_i = std::uintptr_t(dest_c);
-    constexpr std::uintptr_t cache_line_mask = detail::cache_line_size - 1;
-    std::uintptr_t aligned = (dest_i + cache_line_mask) & ~cache_line_mask;
-    std::size_t head = aligned - dest_i;
-    if (head > 0)
-    {
-        if (head >= n)
-        {
-            std::memcpy(dest_c, src_c, n);
-            /* Not normally required, but if the destination is
-             * write-combining memory then this will flush the combining
-             * buffers. That may be necessary if the memory is actually on
-             * a GPU or other accelerator.
-             */
-            _mm_sfence();
-            return dest;
-        }
-        std::memcpy(dest_c, src_c, head);
-        dest_c += head;
-        src_c += head;
-        n -= head;
-    }
-    std::size_t offset;
-    for (offset = 0; offset + 64 <= n; offset += 64)
-    {
-        __m128i value0 = _mm_loadu_si128((__m128i const *) (src_c + offset + 0));
-        __m128i value1 = _mm_loadu_si128((__m128i const *) (src_c + offset + 16));
-        __m128i value2 = _mm_loadu_si128((__m128i const *) (src_c + offset + 32));
-        __m128i value3 = _mm_loadu_si128((__m128i const *) (src_c + offset + 48));
-        _mm_stream_si128((__m128i *) (dest_c + offset + 0), value0);
-        _mm_stream_si128((__m128i *) (dest_c + offset + 16), value1);
-        _mm_stream_si128((__m128i *) (dest_c + offset + 32), value2);
-        _mm_stream_si128((__m128i *) (dest_c + offset + 48), value3);
-    }
-    std::size_t tail = n - offset;
-    std::memcpy(dest_c + offset, src_c + offset, tail);
-    _mm_sfence();
-    return dest;
+#if SPEAD2_USE_SSE2_STREAM
+    // We only get here on x86_64, where SSE2 is guaranteed to be supported by hardware
+    return memcpy_nontemporal_sse2(dest, src, n);
+#else
+    return memcpy(dest, src, n);
+#endif
 }
-#endif // SPEAD2_USE_MOVNTDQ
+
+#endif // SPEAD2_USE_FMV
 
 } // namespace spead2
diff --git a/src/common_memcpy_impl.h b/src/common_memcpy_impl.h
new file mode 100644
index 000000000..9b7bfa902
--- /dev/null
+++ b/src/common_memcpy_impl.h
@@ -0,0 +1,113 @@
+/* Copyright 2016, 2020, 2023 National Research Foundation (SARAO)
+ *
+ * This program is free software: you can redistribute it and/or modify it under
+ * the terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file Non-temporal memcpy implementation. This header file is included
+ * multiple times, with the including code providing different macros each
+ * time. While C++ metaprogramming would have been preferable, GCC does not
+ * provide a way to specialise a template function based on the target
+ * options.
+ */
+
+namespace spead2
+{
+
+namespace detail
+{
+
+// U1 and U2 is unrolling factor; Ix is an index sequence 0, ..., Ux-1
+template<int U1, int U2, std::size_t... I1, std::size_t... I2 >
+[[gnu::target(SPEAD2_MEMCPY_TARGET)]]
+static void *SPEAD2_MEMCPY_NAME(
+    void * __restrict__ dest, const void * __restrict__ src, std::size_t n,
+    std::index_sequence<I1...>,
+    std::index_sequence<I2...>) noexcept
+{
+    using T = SPEAD2_MEMCPY_TYPE;
+    char * __restrict__ dest_c = (char *) dest;
+    const char * __restrict__ src_c = (const char *) src;
+    // Align the destination to a cache-line boundary
+    std::uintptr_t dest_i = std::uintptr_t(dest_c);
+    constexpr std::uintptr_t cache_line_mask = cache_line_size - 1;
+    std::uintptr_t aligned = (dest_i + cache_line_mask) & ~cache_line_mask;
+    std::size_t head = aligned - dest_i;
+    if (head > 0)
+    {
+        if (head >= n)
+        {
+            std::memcpy(dest_c, src_c, n);
+            /* Not normally required, but if the destination is
+             * write-combining memory then this will flush the combining
+             * buffers. That may be necessary if the memory is actually on
+             * a GPU or other accelerator.
+             */
+            _mm_sfence();
+            return dest;
+        }
+        std::memcpy(dest_c, src_c, head);
+        dest_c += head;
+        src_c += head;
+        n -= head;
+    }
+    std::size_t offset = 0;
+    for (; offset + U1 * sizeof(T) <= n; offset += U1 * sizeof(T))
+    {
+        T values[U1];
+        /* These fold expressions are really just loops in disguise. They're used
+         * because GCC at -O2 doesn't do a good job of unrolling the loop.
+         */
+        ((values[I1] = SPEAD2_MEMCPY_LOAD((const T *) (src_c + offset + I1 * sizeof(T)))), ...);
+        (SPEAD2_MEMCPY_STORE((T *) (dest_c + offset + I1 * sizeof(T)), values[I1]), ...);
+    }
+    if constexpr (U2 < U1)
+    {
+        for (; offset + U2 * sizeof(T) <= n; offset += U2 * sizeof(T))
+        {
+            T values[U2];
+            /* These fold expressions are really just loops in disguise. They're used
+             * because GCC at -O2 doesn't do a good job of unrolling the loop.
+             */
+            ((values[I2] = SPEAD2_MEMCPY_LOAD((const T *) (src_c + offset + I2 * sizeof(T)))), ...);
+            (SPEAD2_MEMCPY_STORE((T *) (dest_c + offset + I2 * sizeof(T)), values[I2]), ...);
+        }
+    }
+    std::size_t tail = n - offset;
+    std::memcpy(dest_c + offset, src_c + offset, tail);
+    _mm_sfence();
+    return dest;
+}
+
+} // namespace detail
+
+[[gnu::target(SPEAD2_MEMCPY_TARGET)]]
+void *SPEAD2_MEMCPY_NAME(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
+{
+    constexpr std::size_t unroll2 = detail::cache_line_size / sizeof(SPEAD2_MEMCPY_TYPE);
+    return detail::SPEAD2_MEMCPY_NAME<SPEAD2_MEMCPY_UNROLL, unroll2>(
+        dest, src, n,
+        std::make_index_sequence<SPEAD2_MEMCPY_UNROLL>(),
+        std::make_index_sequence<unroll2>()
+    );
+}
+
+} // namespace spead2
+
+#undef SPEAD2_MEMCPY_NAME
+#undef SPEAD2_MEMCPY_TARGET
+#undef SPEAD2_MEMCPY_TYPE
+#undef SPEAD2_MEMCPY_LOAD
+#undef SPEAD2_MEMCPY_STORE
+#undef SPEAD2_MEMCPY_UNROLL
diff --git a/src/unittest_memcpy.cpp b/src/unittest_memcpy.cpp
index 8a0c7b127..cc01dd1fa 100644
--- a/src/unittest_memcpy.cpp
+++ b/src/unittest_memcpy.cpp
@@ -21,22 +21,70 @@
  */
 
 #include <boost/test/unit_test.hpp>
+#include <boost/test/data/test_case.hpp>
 #include <utility>
 #include <cstdint>
+#include <ostream>
 #include <spead2/common_memcpy.h>
 
+/* Declare the implementations of the instruction-specific implementations, so
+ * that we can test all of them (that the current CPU supports) rather than
+ * just the one selected by the resolver.
+ */
+namespace spead2
+{
+#if SPEAD2_USE_SSE2_STREAM
+void *memcpy_nontemporal_sse2(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
+#endif
+#if SPEAD2_USE_AVX_STREAM
+void *memcpy_nontemporal_avx(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
+#endif
+#if SPEAD2_USE_AVX512_STREAM
+void *memcpy_nontemporal_avx512(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
+#endif
+} // namespace spead2
+
 namespace spead2::unittest
 {
 
 BOOST_AUTO_TEST_SUITE(common)
 BOOST_AUTO_TEST_SUITE(memcpy)
 
-// Checks every combination of src and dest alignment relative to a page
-BOOST_AUTO_TEST_CASE(memcpy_nontemporal_alignments)
+struct memcpy_function
 {
-    constexpr int head_pad = 32;
-    constexpr int tail_pad = 32;
-    constexpr int max_len = 128;
+    const char * name;
+    void *(*func)(void * __restrict__, const void * __restrict__, std::size_t) noexcept;
+    bool enabled;
+};
+
+std::ostream &operator<<(std::ostream &o, const memcpy_function &func)
+{
+    return o << func.name;
+}
+
+static const memcpy_function memcpy_functions[] =
+{
+    { "default", spead2::memcpy_nontemporal, true },
+#if SPEAD2_USE_SSE2_STREAM
+    { "sse2", spead2::memcpy_nontemporal_sse2, bool(__builtin_cpu_supports("sse2")) },
+#endif
+#if SPEAD2_USE_AVX_STREAM
+    { "avx", spead2::memcpy_nontemporal_avx, bool(__builtin_cpu_supports("avx")) },
+#endif
+#if SPEAD2_USE_AVX512_STREAM
+    { "avx512", spead2::memcpy_nontemporal_avx512, bool(__builtin_cpu_supports("avx512f")) },
+#endif
+};
+
+// Checks combinations of src and dest alignment relative to a page
+BOOST_DATA_TEST_CASE(memcpy_nontemporal_alignments, boost::unit_test::data::make(memcpy_functions), sample)
+{
+    if (!sample.enabled)
+        return;
+
+    constexpr int head_pad = 64;
+    constexpr int tail_pad = 64;
+    constexpr int max_len = 1024;
     constexpr int align_range = 64;
     constexpr int buffer_size = head_pad + align_range + max_len + tail_pad;
 
@@ -45,13 +93,14 @@ BOOST_AUTO_TEST_CASE(memcpy_nontemporal_alignments)
     std::uint8_t expected[buffer_size];
     for (int i = 0; i < align_range; i++)
         for (int j = 0; j < align_range; j++)
-            for (int len = 0; len <= max_len; len++)
+            // Step 1 at a time up to 128, then take larger steps to reduce test time
+            for (int len = 0; len <= max_len; len = (len < 128) ? len + 1 : len + 15)
             {
                 std::memset(dest_buffer, 255, sizeof(dest_buffer));
                 for (int k = 0; k < buffer_size; k++)
                     src_buffer[k] = k % 255;
-                spead2::memcpy_nontemporal(dest_buffer + head_pad + i,
-                                           src_buffer + head_pad + j, len);
+                void *ret = sample.func(dest_buffer + head_pad + i, src_buffer + head_pad + j, len);
+                BOOST_TEST(ret == dest_buffer + head_pad + i);
 
                 std::memset(expected, 255, sizeof(expected));
                 for (int k = 0; k < len; k++)