diff --git a/doc/migrate-4.rst b/doc/migrate-4.rst index c6000dee0..4e0d67cfe 100644 --- a/doc/migrate-4.rst +++ b/doc/migrate-4.rst @@ -107,7 +107,7 @@ autotools meson ``--without-posix-semaphores`` ``posix_semaphores=disabled`` ``--without-pthread_setaffinity_np`` ``pthread_setaffinity_np=disabled`` ``--without-fmv`` ``fmv=disabled`` -``--without-movntdq`` ``movntdq=disabled`` +``--without-movntdq`` ``sse2_stream=disabled`` ``--without-cuda`` ``cuda=disabled`` ``--without-gdrapi`` ``gdrapi=disabled`` ====================================== ===================================== diff --git a/include/spead2/common_defines.h b/include/spead2/common_defines.h index bd35ad223..8d9a977a7 100644 --- a/include/spead2/common_defines.h +++ b/include/spead2/common_defines.h @@ -34,12 +34,6 @@ #define SPEAD2_MAX_LOG_LEVEL (spead2::log_level::info) #endif -#if SPEAD2_USE_FMV -# define SPEAD2_FMV_TARGET(x) [[gnu::target(x)]] -#else -# define SPEAD2_FMV_TARGET(x) -#endif - /** * SPEAD protocol sending and receiving. All SPEAD-64-* flavours are * supported. diff --git a/include/spead2/common_features.h.in b/include/spead2/common_features.h.in index 0cf104afd..89242bfa7 100644 --- a/include/spead2/common_features.h.in +++ b/include/spead2/common_features.h.in @@ -38,25 +38,33 @@ /* Python on MacOS likes to build universal binaries, which causes problems * because it doesn't match the compilation environment detected at * configuration time. So features that we only use on x86 are only enabled - * if x86 is actually detected at build time. + * if x86 is actually detected at build time, and if function + * multi-versioning is available. */ #if defined(__i386__) || defined(__i386) || defined(__x86_64__) || defined(__x86_64) # define SPEAD2_USE_FMV @SPEAD2_USE_FMV@ -/* On i386, MOVNTDQ is not guaranteed to exist at runtime, and we need function - * multi-versioning to make it safe to use. - */ -# if SPEAD2_USE_FMV || defined(__x86_64__) || defined(__x86_64) -# define SPEAD2_USE_MOVNTDQ @SPEAD2_USE_MOVNTDQ@ -# else -# define SPEAD2_USE_MOVNTDQ 0 +# if SPEAD2_USE_FMV +# define SPEAD2_USE_SSE2_STREAM @SPEAD2_USE_SSE2_STREAM@ +# define SPEAD2_USE_AVX_STREAM @SPEAD2_USE_AVX_STREAM@ +# define SPEAD2_USE_AVX512_STREAM @SPEAD2_USE_AVX512_STREAM@ +# elif defined(__x86_64__) || defined(__x86_64) +# define SPEAD2_USE_SSE2_STREAM @SPEAD2_USE_SSE2_STREAM@ /* Guaranteed to exist on x86_64 */ # endif -#else // not x86 +#endif +#ifndef SPEAD2_USE_FMV # define SPEAD2_USE_FMV 0 -# define SPEAD2_USE_MOVNTDQ 0 - +#endif +#ifndef SPEAD2_USE_SSE2_STREAM +# define SPEAD2_USE_SSE2_STREAM 0 +#endif +#ifndef SPEAD2_USE_AVX_STREAM +# define SPEAD2_USE_AVX_STREAM 0 +#endif +#ifndef SPEAD2_USE_AVX512_STREAM +# define SPEAD2_USE_AVX512_STREAM 0 #endif #define SPEAD2_USE_POSIX_SEMAPHORES @SPEAD2_USE_POSIX_SEMAPHORES@ diff --git a/include/spead2/common_memcpy.h b/include/spead2/common_memcpy.h index a522d7448..e44f97fdd 100644 --- a/include/spead2/common_memcpy.h +++ b/include/spead2/common_memcpy.h @@ -31,15 +31,7 @@ namespace spead2 { -#if SPEAD2_USE_FMV || !SPEAD2_USE_MOVNTDQ -SPEAD2_FMV_TARGET("default") void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept; -#endif - -#if SPEAD2_USE_MOVNTDQ -SPEAD2_FMV_TARGET("sse2") -void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept; -#endif } // namespace spead2 diff --git a/meson.build b/meson.build index 518af5a28..13ff12eac 100644 --- a/meson.build +++ b/meson.build @@ -191,26 +191,49 @@ use_pthread_setaffinity_np = get_option('pthread_setaffinity_np').require( dependencies : thread_dep ) ).allowed() -use_fmv = get_option('fmv').require( +use_fmv = get_option('fmv').require(compiler.has_function_attribute('ifunc')).allowed() +# has_function doesn't work for _mm_stream_si128 and friends, because they +# are inline-only functions in GCC without external definitions. +use_sse2_stream = get_option('sse2_stream').require( compiler.compiles( - '__attribute__((target("default"))) void foo() {}', - args : '-Werror', # TODO: use werror kwarg once meson 1.3 is in use - name : 'function multi-versioning' + ''' + #include + + [[gnu::target("sse2")]] + void foo() + { + _mm_stream_si128((__m128i *) NULL, __m128i()); + } + ''', + name : 'SSE2 streaming intrinsic' ) ).allowed() -# has_function doesn't work for _mm_stream_si128, because it is an -# inline-only function in GCC without an external definition. -use_movntdq = get_option('movntdq').require( +use_avx_stream = get_option('avx_stream').require( compiler.compiles( ''' - #include + #include + [[gnu::target("avx")]] void foo() { - _mm_stream_si128((__m128i *) NULL, __m128i()); + _mm256_stream_si256((__m256i *) NULL, __m256i()); + } + ''', + name : 'AVX streaming intrinsic' + ) +).allowed() +use_avx512_stream = get_option('avx512_stream').require( + compiler.compiles( + ''' + #include + + [[gnu::target("avx512f")]] + void foo() + { + _mm512_stream_si512((__m512i *) NULL, __m512i()); } ''', - name : 'MOVNTDQ intrinsic' + name : 'AVX-512 streaming intrinsic' ) ).allowed() @@ -230,7 +253,9 @@ conf.set10('SPEAD2_USE_EVENTFD', use_eventfd) conf.set10('SPEAD2_USE_POSIX_SEMAPHORES', use_posix_semaphores) conf.set10('SPEAD2_USE_PTHREAD_SETAFFINITY_NP', use_pthread_setaffinity_np) conf.set10('SPEAD2_USE_FMV', use_fmv) -conf.set10('SPEAD2_USE_MOVNTDQ', use_movntdq) +conf.set10('SPEAD2_USE_SSE2_STREAM', use_sse2_stream) +conf.set10('SPEAD2_USE_AVX_STREAM', use_avx_stream) +conf.set10('SPEAD2_USE_AVX512_STREAM', use_avx512_stream) conf.set10('SPEAD2_USE_PCAP', pcap_dep.found()) conf.set('SPEAD2_MAX_LOG_LEVEL', '(spead2::log_level::' + get_option('max_log_level') + ')') diff --git a/meson.options b/meson.options index a4ada16e0..65187a277 100644 --- a/meson.options +++ b/meson.options @@ -25,7 +25,9 @@ option('eventfd', type : 'feature', description : 'Use eventfd system call for s option('posix_semaphores', type : 'feature', description : 'Use POSIX semaphores') option('pthread_setaffinity_np', type : 'feature', description : 'Use pthread_setaffinity_np to set thread affinity') option('fmv', type : 'feature', description : 'Use function multi-versioning') -option('movntdq', type : 'feature', description : 'Use MOVNTDQ instruction for non-temporal stores') +option('sse2_stream', type : 'feature', description : 'Use SSE2 for non-temporal stores') +option('avx_stream', type : 'feature', description : 'Use AVX for non-temporal stores') +option('avx512_stream', type : 'feature', description : 'Use AVX-512 for non-temporal stores') option('cuda', type : 'feature', description : 'Build CUDA examples') option('gdrapi', type : 'feature', description : 'Build gdrcopy examples') option('unit_test', type : 'feature', description : 'Build the unit tests') diff --git a/src/common_memcpy.cpp b/src/common_memcpy.cpp index 860ce43ec..59db889ce 100644 --- a/src/common_memcpy.cpp +++ b/src/common_memcpy.cpp @@ -1,4 +1,4 @@ -/* Copyright 2016, 2020 National Research Foundation (SARAO) +/* Copyright 2016, 2020, 2023 National Research Foundation (SARAO) * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU Lesser General Public License as published by the Free @@ -17,70 +17,94 @@ #include #include #include +#include #include #include #include -#if SPEAD2_USE_MOVNTDQ + +#if SPEAD2_USE_SSE2_STREAM # include +# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_sse2 +# define SPEAD2_MEMCPY_TARGET "sse2" +# define SPEAD2_MEMCPY_TYPE __m128i +# define SPEAD2_MEMCPY_LOAD _mm_loadu_si128 +# define SPEAD2_MEMCPY_STORE _mm_stream_si128 +# define SPEAD2_MEMCPY_UNROLL 16 +# include "common_memcpy_impl.h" +#endif + +#if SPEAD2_USE_AVX_STREAM +# include +# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_avx +# define SPEAD2_MEMCPY_TARGET "avx" +# define SPEAD2_MEMCPY_TYPE __m256i +# define SPEAD2_MEMCPY_LOAD _mm256_loadu_si256 +# define SPEAD2_MEMCPY_STORE _mm256_stream_si256 +# define SPEAD2_MEMCPY_UNROLL 8 +# include "common_memcpy_impl.h" +#endif + +#if SPEAD2_USE_AVX512_STREAM +# include +# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_avx512 +# define SPEAD2_MEMCPY_TARGET "avx512f" +# define SPEAD2_MEMCPY_TYPE __m512i +# define SPEAD2_MEMCPY_LOAD _mm512_loadu_si512 +# define SPEAD2_MEMCPY_STORE _mm512_stream_si512 +# define SPEAD2_MEMCPY_UNROLL 8 +# include "common_memcpy_impl.h" #endif namespace spead2 { -#if SPEAD2_USE_FMV || !SPEAD2_USE_MOVNTDQ -SPEAD2_FMV_TARGET("default") -void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept +#if SPEAD2_USE_FMV + +extern "C" +{ + +static void *(*resolve_memcpy_nontemporal())(void *, const void *, std::size_t) { - return std::memcpy(dest, src, n); + __builtin_cpu_init(); + /* On Skylake server, AVX-512 reduces clock speeds. Use the logic as Glibc to + * decide whether AVX-512 is okay: it's okay if either AVX512ER or + * AVX512-VNNI is present. Glibc only applies that logic to Intel CPUs, but + * AMD introduced AVX-512 with Zen 4 which also supports AVX512-VNNI (and + * performs well), so we don't need to distinguish. + */ +#if SPEAD2_USE_AVX512_STREAM + if (__builtin_cpu_supports("avx512f") + && (__builtin_cpu_supports("avx512er") || __builtin_cpu_supports("avx512vnni"))) + return memcpy_nontemporal_avx512; +#endif +#if SPEAD2_USE_AVX_STREAM + if (__builtin_cpu_supports("avx")) + return memcpy_nontemporal_avx; +#endif +#if SPEAD2_USE_SSE2_STREAM + if (__builtin_cpu_supports("sse2")) + return memcpy_nontemporal_sse2; +#endif + return std::memcpy; } -#endif // SPEAD2_USE_FMV || !SPEAD2_USE_MOVNTDQ -#if SPEAD2_USE_MOVNTDQ -SPEAD2_FMV_TARGET("sse2") -void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept +} // extern "C" + +[[gnu::ifunc("resolve_memcpy_nontemporal")]] +void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept; + +#else + +void memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept { - char * __restrict__ dest_c = (char *) dest; - const char * __restrict__ src_c = (const char *) src; - // Align the destination to a cache-line boundary - std::uintptr_t dest_i = std::uintptr_t(dest_c); - constexpr std::uintptr_t cache_line_mask = detail::cache_line_size - 1; - std::uintptr_t aligned = (dest_i + cache_line_mask) & ~cache_line_mask; - std::size_t head = aligned - dest_i; - if (head > 0) - { - if (head >= n) - { - std::memcpy(dest_c, src_c, n); - /* Not normally required, but if the destination is - * write-combining memory then this will flush the combining - * buffers. That may be necessary if the memory is actually on - * a GPU or other accelerator. - */ - _mm_sfence(); - return dest; - } - std::memcpy(dest_c, src_c, head); - dest_c += head; - src_c += head; - n -= head; - } - std::size_t offset; - for (offset = 0; offset + 64 <= n; offset += 64) - { - __m128i value0 = _mm_loadu_si128((__m128i const *) (src_c + offset + 0)); - __m128i value1 = _mm_loadu_si128((__m128i const *) (src_c + offset + 16)); - __m128i value2 = _mm_loadu_si128((__m128i const *) (src_c + offset + 32)); - __m128i value3 = _mm_loadu_si128((__m128i const *) (src_c + offset + 48)); - _mm_stream_si128((__m128i *) (dest_c + offset + 0), value0); - _mm_stream_si128((__m128i *) (dest_c + offset + 16), value1); - _mm_stream_si128((__m128i *) (dest_c + offset + 32), value2); - _mm_stream_si128((__m128i *) (dest_c + offset + 48), value3); - } - std::size_t tail = n - offset; - std::memcpy(dest_c + offset, src_c + offset, tail); - _mm_sfence(); - return dest; +#if SPEAD2_USE_SSE2_STREAM + // We only get here on x86_64, where SSE2 is guaranteed to be supported by hardware + return memcpy_nontemporal_sse2(dest, src, n); +#else + return memcpy(dest, src, n); +#endif } -#endif // SPEAD2_USE_MOVNTDQ + +#endif // SPEAD2_USE_FMV } // namespace spead2 diff --git a/src/common_memcpy_impl.h b/src/common_memcpy_impl.h new file mode 100644 index 000000000..9b7bfa902 --- /dev/null +++ b/src/common_memcpy_impl.h @@ -0,0 +1,113 @@ +/* Copyright 2016, 2020, 2023 National Research Foundation (SARAO) + * + * This program is free software: you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see . + */ + +/** + * @file Non-temporal memcpy implementation. This header file is included + * multiple times, with the including code providing different macros each + * time. While C++ metaprogramming would have been preferable, GCC does not + * provide a way to specialise a template function based on the target + * options. + */ + +namespace spead2 +{ + +namespace detail +{ + +// U1 and U2 is unrolling factor; Ix is an index sequence 0, ..., Ux-1 +template +[[gnu::target(SPEAD2_MEMCPY_TARGET)]] +static void *SPEAD2_MEMCPY_NAME( + void * __restrict__ dest, const void * __restrict__ src, std::size_t n, + std::index_sequence, + std::index_sequence) noexcept +{ + using T = SPEAD2_MEMCPY_TYPE; + char * __restrict__ dest_c = (char *) dest; + const char * __restrict__ src_c = (const char *) src; + // Align the destination to a cache-line boundary + std::uintptr_t dest_i = std::uintptr_t(dest_c); + constexpr std::uintptr_t cache_line_mask = cache_line_size - 1; + std::uintptr_t aligned = (dest_i + cache_line_mask) & ~cache_line_mask; + std::size_t head = aligned - dest_i; + if (head > 0) + { + if (head >= n) + { + std::memcpy(dest_c, src_c, n); + /* Not normally required, but if the destination is + * write-combining memory then this will flush the combining + * buffers. That may be necessary if the memory is actually on + * a GPU or other accelerator. + */ + _mm_sfence(); + return dest; + } + std::memcpy(dest_c, src_c, head); + dest_c += head; + src_c += head; + n -= head; + } + std::size_t offset = 0; + for (; offset + U1 * sizeof(T) <= n; offset += U1 * sizeof(T)) + { + T values[U1]; + /* These fold expressions are really just loops in disguise. They're used + * because GCC at -O2 doesn't do a good job of unrolling the loop. + */ + ((values[I1] = SPEAD2_MEMCPY_LOAD((const T *) (src_c + offset + I1 * sizeof(T)))), ...); + (SPEAD2_MEMCPY_STORE((T *) (dest_c + offset + I1 * sizeof(T)), values[I1]), ...); + } + if constexpr (U2 < U1) + { + for (; offset + U2 * sizeof(T) <= n; offset += U2 * sizeof(T)) + { + T values[U2]; + /* These fold expressions are really just loops in disguise. They're used + * because GCC at -O2 doesn't do a good job of unrolling the loop. + */ + ((values[I2] = SPEAD2_MEMCPY_LOAD((const T *) (src_c + offset + I2 * sizeof(T)))), ...); + (SPEAD2_MEMCPY_STORE((T *) (dest_c + offset + I2 * sizeof(T)), values[I2]), ...); + } + } + std::size_t tail = n - offset; + std::memcpy(dest_c + offset, src_c + offset, tail); + _mm_sfence(); + return dest; +} + +} // namespace detail + +[[gnu::target(SPEAD2_MEMCPY_TARGET)]] +void *SPEAD2_MEMCPY_NAME(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept +{ + constexpr std::size_t unroll2 = detail::cache_line_size / sizeof(SPEAD2_MEMCPY_TYPE); + return detail::SPEAD2_MEMCPY_NAME( + dest, src, n, + std::make_index_sequence(), + std::make_index_sequence() + ); +} + +} // namespace spead2 + +#undef SPEAD2_MEMCPY_NAME +#undef SPEAD2_MEMCPY_TARGET +#undef SPEAD2_MEMCPY_TYPE +#undef SPEAD2_MEMCPY_LOAD +#undef SPEAD2_MEMCPY_STORE +#undef SPEAD2_MEMCPY_UNROLL diff --git a/src/unittest_memcpy.cpp b/src/unittest_memcpy.cpp index 8a0c7b127..cc01dd1fa 100644 --- a/src/unittest_memcpy.cpp +++ b/src/unittest_memcpy.cpp @@ -21,22 +21,70 @@ */ #include +#include #include #include +#include #include +/* Declare the implementations of the instruction-specific implementations, so + * that we can test all of them (that the current CPU supports) rather than + * just the one selected by the resolver. + */ +namespace spead2 +{ +#if SPEAD2_USE_SSE2_STREAM +void *memcpy_nontemporal_sse2(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept; +#endif +#if SPEAD2_USE_AVX_STREAM +void *memcpy_nontemporal_avx(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept; +#endif +#if SPEAD2_USE_AVX512_STREAM +void *memcpy_nontemporal_avx512(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept; +#endif +} // namespace spead2 + namespace spead2::unittest { BOOST_AUTO_TEST_SUITE(common) BOOST_AUTO_TEST_SUITE(memcpy) -// Checks every combination of src and dest alignment relative to a page -BOOST_AUTO_TEST_CASE(memcpy_nontemporal_alignments) +struct memcpy_function { - constexpr int head_pad = 32; - constexpr int tail_pad = 32; - constexpr int max_len = 128; + const char * name; + void *(*func)(void * __restrict__, const void * __restrict__, std::size_t) noexcept; + bool enabled; +}; + +std::ostream &operator<<(std::ostream &o, const memcpy_function &func) +{ + return o << func.name; +} + +static const memcpy_function memcpy_functions[] = +{ + { "default", spead2::memcpy_nontemporal, true }, +#if SPEAD2_USE_SSE2_STREAM + { "sse2", spead2::memcpy_nontemporal_sse2, bool(__builtin_cpu_supports("sse2")) }, +#endif +#if SPEAD2_USE_AVX_STREAM + { "avx", spead2::memcpy_nontemporal_avx, bool(__builtin_cpu_supports("avx")) }, +#endif +#if SPEAD2_USE_AVX512_STREAM + { "avx512", spead2::memcpy_nontemporal_avx512, bool(__builtin_cpu_supports("avx512f")) }, +#endif +}; + +// Checks combinations of src and dest alignment relative to a page +BOOST_DATA_TEST_CASE(memcpy_nontemporal_alignments, boost::unit_test::data::make(memcpy_functions), sample) +{ + if (!sample.enabled) + return; + + constexpr int head_pad = 64; + constexpr int tail_pad = 64; + constexpr int max_len = 1024; constexpr int align_range = 64; constexpr int buffer_size = head_pad + align_range + max_len + tail_pad; @@ -45,13 +93,14 @@ BOOST_AUTO_TEST_CASE(memcpy_nontemporal_alignments) std::uint8_t expected[buffer_size]; for (int i = 0; i < align_range; i++) for (int j = 0; j < align_range; j++) - for (int len = 0; len <= max_len; len++) + // Step 1 at a time up to 128, then take larger steps to reduce test time + for (int len = 0; len <= max_len; len = (len < 128) ? len + 1 : len + 15) { std::memset(dest_buffer, 255, sizeof(dest_buffer)); for (int k = 0; k < buffer_size; k++) src_buffer[k] = k % 255; - spead2::memcpy_nontemporal(dest_buffer + head_pad + i, - src_buffer + head_pad + j, len); + void *ret = sample.func(dest_buffer + head_pad + i, src_buffer + head_pad + j, len); + BOOST_TEST(ret == dest_buffer + head_pad + i); std::memset(expected, 255, sizeof(expected)); for (int k = 0; k < len; k++)