Skip to content

Commit

Permalink
Add AVX and AVX-512 non-temporal memcpy implementations
Browse files Browse the repository at this point in the history
The implementation has been substantially changed, so that there is an
explicit ifunc resolver, instead of using GCC's function
multi-versioning. This allows AVX-512 to be more conditionally enabled,
and also makes it possible to test all the implementations in the unit
test.

The unrolling factor is now also easier to control, and is set to 16 for
the SSE2 implementation since there are signs that this can marginally
improve performance.
  • Loading branch information
bmerry committed Oct 23, 2023
1 parent 0fd8670 commit 0c50368
Show file tree
Hide file tree
Showing 9 changed files with 305 additions and 98 deletions.
2 changes: 1 addition & 1 deletion doc/migrate-4.rst
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ autotools meson
``--without-posix-semaphores`` ``posix_semaphores=disabled``
``--without-pthread_setaffinity_np`` ``pthread_setaffinity_np=disabled``
``--without-fmv`` ``fmv=disabled``
``--without-movntdq`` ``movntdq=disabled``
``--without-movntdq`` ``sse2_stream=disabled``
``--without-cuda`` ``cuda=disabled``
``--without-gdrapi`` ``gdrapi=disabled``
====================================== =====================================
Expand Down
6 changes: 0 additions & 6 deletions include/spead2/common_defines.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,6 @@
#define SPEAD2_MAX_LOG_LEVEL (spead2::log_level::info)
#endif

#if SPEAD2_USE_FMV
# define SPEAD2_FMV_TARGET(x) [[gnu::target(x)]]
#else
# define SPEAD2_FMV_TARGET(x)
#endif

/**
* SPEAD protocol sending and receiving. All SPEAD-64-* flavours are
* supported.
Expand Down
30 changes: 19 additions & 11 deletions include/spead2/common_features.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -38,25 +38,33 @@
/* Python on MacOS likes to build universal binaries, which causes problems
* because it doesn't match the compilation environment detected at
* configuration time. So features that we only use on x86 are only enabled
* if x86 is actually detected at build time.
* if x86 is actually detected at build time, and if function
* multi-versioning is available.
*/
#if defined(__i386__) || defined(__i386) || defined(__x86_64__) || defined(__x86_64)

# define SPEAD2_USE_FMV @SPEAD2_USE_FMV@
/* On i386, MOVNTDQ is not guaranteed to exist at runtime, and we need function
* multi-versioning to make it safe to use.
*/
# if SPEAD2_USE_FMV || defined(__x86_64__) || defined(__x86_64)
# define SPEAD2_USE_MOVNTDQ @SPEAD2_USE_MOVNTDQ@
# else
# define SPEAD2_USE_MOVNTDQ 0
# if SPEAD2_USE_FMV
# define SPEAD2_USE_SSE2_STREAM @SPEAD2_USE_SSE2_STREAM@
# define SPEAD2_USE_AVX_STREAM @SPEAD2_USE_AVX_STREAM@
# define SPEAD2_USE_AVX512_STREAM @SPEAD2_USE_AVX512_STREAM@
# elif defined(__x86_64__) || defined(__x86_64)
# define SPEAD2_USE_SSE2_STREAM @SPEAD2_USE_SSE2_STREAM@ /* Guaranteed to exist on x86_64 */
# endif

#else // not x86
#endif

#ifndef SPEAD2_USE_FMV
# define SPEAD2_USE_FMV 0
# define SPEAD2_USE_MOVNTDQ 0

#endif
#ifndef SPEAD2_USE_SSE2_STREAM
# define SPEAD2_USE_SSE2_STREAM 0
#endif
#ifndef SPEAD2_USE_AVX_STREAM
# define SPEAD2_USE_AVX_STREAM 0
#endif
#ifndef SPEAD2_USE_AVX512_STREAM
# define SPEAD2_USE_AVX512_STREAM 0
#endif

#define SPEAD2_USE_POSIX_SEMAPHORES @SPEAD2_USE_POSIX_SEMAPHORES@
Expand Down
8 changes: 0 additions & 8 deletions include/spead2/common_memcpy.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,7 @@
namespace spead2
{

#if SPEAD2_USE_FMV || !SPEAD2_USE_MOVNTDQ
SPEAD2_FMV_TARGET("default")
void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
#endif

#if SPEAD2_USE_MOVNTDQ
SPEAD2_FMV_TARGET("sse2")
void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;
#endif

} // namespace spead2

Expand Down
47 changes: 36 additions & 11 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -191,26 +191,49 @@ use_pthread_setaffinity_np = get_option('pthread_setaffinity_np').require(
dependencies : thread_dep
)
).allowed()
use_fmv = get_option('fmv').require(
use_fmv = get_option('fmv').require(compiler.has_function_attribute('ifunc')).allowed()
# has_function doesn't work for _mm_stream_si128 and friends, because they
# are inline-only functions in GCC without external definitions.
use_sse2_stream = get_option('sse2_stream').require(
compiler.compiles(
'__attribute__((target("default"))) void foo() {}',
args : '-Werror', # TODO: use werror kwarg once meson 1.3 is in use
name : 'function multi-versioning'
'''
#include <emmintrin.h>
[[gnu::target("sse2")]]
void foo()
{
_mm_stream_si128((__m128i *) NULL, __m128i());
}
''',
name : 'SSE2 streaming intrinsic'
)
).allowed()
# has_function doesn't work for _mm_stream_si128, because it is an
# inline-only function in GCC without an external definition.
use_movntdq = get_option('movntdq').require(
use_avx_stream = get_option('avx_stream').require(
compiler.compiles(
'''
#include <emmintrin.h>
#include <immintrin.h>
[[gnu::target("avx")]]
void foo()
{
_mm_stream_si128((__m128i *) NULL, __m128i());
_mm256_stream_si256((__m256i *) NULL, __m256i());
}
''',
name : 'AVX streaming intrinsic'
)
).allowed()
use_avx512_stream = get_option('avx512_stream').require(
compiler.compiles(
'''
#include <immintrin.h>
[[gnu::target("avx512f")]]
void foo()
{
_mm512_stream_si512((__m512i *) NULL, __m512i());
}
''',
name : 'MOVNTDQ intrinsic'
name : 'AVX-512 streaming intrinsic'
)
).allowed()

Expand All @@ -230,7 +253,9 @@ conf.set10('SPEAD2_USE_EVENTFD', use_eventfd)
conf.set10('SPEAD2_USE_POSIX_SEMAPHORES', use_posix_semaphores)
conf.set10('SPEAD2_USE_PTHREAD_SETAFFINITY_NP', use_pthread_setaffinity_np)
conf.set10('SPEAD2_USE_FMV', use_fmv)
conf.set10('SPEAD2_USE_MOVNTDQ', use_movntdq)
conf.set10('SPEAD2_USE_SSE2_STREAM', use_sse2_stream)
conf.set10('SPEAD2_USE_AVX_STREAM', use_avx_stream)
conf.set10('SPEAD2_USE_AVX512_STREAM', use_avx512_stream)
conf.set10('SPEAD2_USE_PCAP', pcap_dep.found())
conf.set('SPEAD2_MAX_LOG_LEVEL', '(spead2::log_level::' + get_option('max_log_level') + ')')

Expand Down
4 changes: 3 additions & 1 deletion meson.options
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ option('eventfd', type : 'feature', description : 'Use eventfd system call for s
option('posix_semaphores', type : 'feature', description : 'Use POSIX semaphores')
option('pthread_setaffinity_np', type : 'feature', description : 'Use pthread_setaffinity_np to set thread affinity')
option('fmv', type : 'feature', description : 'Use function multi-versioning')
option('movntdq', type : 'feature', description : 'Use MOVNTDQ instruction for non-temporal stores')
option('sse2_stream', type : 'feature', description : 'Use SSE2 for non-temporal stores')
option('avx_stream', type : 'feature', description : 'Use AVX for non-temporal stores')
option('avx512_stream', type : 'feature', description : 'Use AVX-512 for non-temporal stores')
option('cuda', type : 'feature', description : 'Build CUDA examples')
option('gdrapi', type : 'feature', description : 'Build gdrcopy examples')
option('unit_test', type : 'feature', description : 'Build the unit tests')
Expand Down
128 changes: 76 additions & 52 deletions src/common_memcpy.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright 2016, 2020 National Research Foundation (SARAO)
/* Copyright 2016, 2020, 2023 National Research Foundation (SARAO)
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
Expand All @@ -17,70 +17,94 @@
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <utility>
#include <spead2/common_defines.h>
#include <spead2/common_features.h>
#include <spead2/common_memcpy.h>
#if SPEAD2_USE_MOVNTDQ

#if SPEAD2_USE_SSE2_STREAM
# include <emmintrin.h>
# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_sse2
# define SPEAD2_MEMCPY_TARGET "sse2"
# define SPEAD2_MEMCPY_TYPE __m128i
# define SPEAD2_MEMCPY_LOAD _mm_loadu_si128
# define SPEAD2_MEMCPY_STORE _mm_stream_si128
# define SPEAD2_MEMCPY_UNROLL 16
# include "common_memcpy_impl.h"
#endif

#if SPEAD2_USE_AVX_STREAM
# include <immintrin.h>
# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_avx
# define SPEAD2_MEMCPY_TARGET "avx"
# define SPEAD2_MEMCPY_TYPE __m256i
# define SPEAD2_MEMCPY_LOAD _mm256_loadu_si256
# define SPEAD2_MEMCPY_STORE _mm256_stream_si256
# define SPEAD2_MEMCPY_UNROLL 8
# include "common_memcpy_impl.h"
#endif

#if SPEAD2_USE_AVX512_STREAM
# include <immintrin.h>
# define SPEAD2_MEMCPY_NAME memcpy_nontemporal_avx512
# define SPEAD2_MEMCPY_TARGET "avx512f"
# define SPEAD2_MEMCPY_TYPE __m512i
# define SPEAD2_MEMCPY_LOAD _mm512_loadu_si512
# define SPEAD2_MEMCPY_STORE _mm512_stream_si512
# define SPEAD2_MEMCPY_UNROLL 8
# include "common_memcpy_impl.h"
#endif

namespace spead2
{

#if SPEAD2_USE_FMV || !SPEAD2_USE_MOVNTDQ
SPEAD2_FMV_TARGET("default")
void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
#if SPEAD2_USE_FMV

extern "C"
{

static void *(*resolve_memcpy_nontemporal())(void *, const void *, std::size_t)
{
return std::memcpy(dest, src, n);
__builtin_cpu_init();
/* On Skylake server, AVX-512 reduces clock speeds. Use the logic as Glibc to
* decide whether AVX-512 is okay: it's okay if either AVX512ER or
* AVX512-VNNI is present. Glibc only applies that logic to Intel CPUs, but
* AMD introduced AVX-512 with Zen 4 which also supports AVX512-VNNI (and
* performs well), so we don't need to distinguish.
*/
#if SPEAD2_USE_AVX512_STREAM
if (__builtin_cpu_supports("avx512f")
&& (__builtin_cpu_supports("avx512er") || __builtin_cpu_supports("avx512vnni")))
return memcpy_nontemporal_avx512;
#endif
#if SPEAD2_USE_AVX_STREAM
if (__builtin_cpu_supports("avx"))
return memcpy_nontemporal_avx;
#endif
#if SPEAD2_USE_SSE2_STREAM
if (__builtin_cpu_supports("sse2"))
return memcpy_nontemporal_sse2;
#endif
return std::memcpy;
}
#endif // SPEAD2_USE_FMV || !SPEAD2_USE_MOVNTDQ

#if SPEAD2_USE_MOVNTDQ
SPEAD2_FMV_TARGET("sse2")
void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
} // extern "C"

[[gnu::ifunc("resolve_memcpy_nontemporal")]]
void *memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept;

#else

void memcpy_nontemporal(void * __restrict__ dest, const void * __restrict__ src, std::size_t n) noexcept
{
char * __restrict__ dest_c = (char *) dest;
const char * __restrict__ src_c = (const char *) src;
// Align the destination to a cache-line boundary
std::uintptr_t dest_i = std::uintptr_t(dest_c);
constexpr std::uintptr_t cache_line_mask = detail::cache_line_size - 1;
std::uintptr_t aligned = (dest_i + cache_line_mask) & ~cache_line_mask;
std::size_t head = aligned - dest_i;
if (head > 0)
{
if (head >= n)
{
std::memcpy(dest_c, src_c, n);
/* Not normally required, but if the destination is
* write-combining memory then this will flush the combining
* buffers. That may be necessary if the memory is actually on
* a GPU or other accelerator.
*/
_mm_sfence();
return dest;
}
std::memcpy(dest_c, src_c, head);
dest_c += head;
src_c += head;
n -= head;
}
std::size_t offset;
for (offset = 0; offset + 64 <= n; offset += 64)
{
__m128i value0 = _mm_loadu_si128((__m128i const *) (src_c + offset + 0));
__m128i value1 = _mm_loadu_si128((__m128i const *) (src_c + offset + 16));
__m128i value2 = _mm_loadu_si128((__m128i const *) (src_c + offset + 32));
__m128i value3 = _mm_loadu_si128((__m128i const *) (src_c + offset + 48));
_mm_stream_si128((__m128i *) (dest_c + offset + 0), value0);
_mm_stream_si128((__m128i *) (dest_c + offset + 16), value1);
_mm_stream_si128((__m128i *) (dest_c + offset + 32), value2);
_mm_stream_si128((__m128i *) (dest_c + offset + 48), value3);
}
std::size_t tail = n - offset;
std::memcpy(dest_c + offset, src_c + offset, tail);
_mm_sfence();
return dest;
#if SPEAD2_USE_SSE2_STREAM
// We only get here on x86_64, where SSE2 is guaranteed to be supported by hardware
return memcpy_nontemporal_sse2(dest, src, n);
#else
return memcpy(dest, src, n);
#endif
}
#endif // SPEAD2_USE_MOVNTDQ

#endif // SPEAD2_USE_FMV

} // namespace spead2
Loading

0 comments on commit 0c50368

Please sign in to comment.