-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathfallback_builtins.h
133 lines (116 loc) · 5.69 KB
/
fallback_builtins.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#ifndef FALLBACK_BUILTINS_H
#define FALLBACK_BUILTINS_H
#if defined(_MSC_VER) && !defined(__clang__)
#if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) || defined(_M_ARM) || defined(_M_ARM64)
#include <intrin.h>
#ifdef X86_FEATURES
# include "arch/x86/x86_features.h"
#endif
/* This is not a general purpose replacement for __builtin_ctz. The function expects that value is != 0.
* Because of that assumption trailing_zero is not initialized and the return value is not checked.
* Tzcnt and bsf give identical results except when input value is 0, therefore this can not be allowed.
* If tzcnt instruction is not supported, the cpu will itself execute bsf instead.
* Performance tzcnt/bsf is identical on Intel cpu, tzcnt is faster than bsf on AMD cpu.
*/
static __forceinline int __builtin_ctz(unsigned int value) {
Assert(value != 0, "Invalid input value: 0");
# if defined(X86_FEATURES) && !(_MSC_VER < 1700)
return (int)_tzcnt_u32(value);
# else
unsigned long trailing_zero;
_BitScanForward(&trailing_zero, value);
return (int)trailing_zero;
# endif
}
#define HAVE_BUILTIN_CTZ
#ifdef _M_AMD64
/* This is not a general purpose replacement for __builtin_ctzll. The function expects that value is != 0.
* Because of that assumption trailing_zero is not initialized and the return value is not checked.
*/
static __forceinline int __builtin_ctzll(unsigned long long value) {
Assert(value != 0, "Invalid input value: 0");
# if defined(X86_FEATURES) && !(_MSC_VER < 1700)
return (int)_tzcnt_u64(value);
# else
unsigned long trailing_zero;
_BitScanForward64(&trailing_zero, value);
return (int)trailing_zero;
# endif
}
#define HAVE_BUILTIN_CTZLL
#endif // Microsoft AMD64
#endif // Microsoft AMD64/IA64/x86/ARM/ARM64 test
#endif // _MSC_VER & !clang
/* Unfortunately GCC didn't support these things until version 10.
* Similarly, AppleClang didn't support them in Xcode 9.2 but did in 9.3.
*/
#ifdef __AVX2__
#include <immintrin.h>
#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10) \
|| (defined(__apple_build_version__) && __apple_build_version__ < 9020039)
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
__m128i r;
__asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
return _mm256_castsi128_si256(r);
}
#ifdef __AVX512F__
static inline __m512i _mm512_zextsi128_si512(__m128i a) {
__m128i r;
__asm__ volatile ("vmovdqa %1,%0" : "=x" (r) : "x" (a));
return _mm512_castsi128_si512(r);
}
#endif // __AVX512F__
#endif // gcc/AppleClang version test
#endif // __AVX2__
/* GCC <9 is missing some AVX512 intrinsics.
*/
#ifdef __AVX512F__
#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 9)
#include <immintrin.h>
#define PACK(c0, c1, c2, c3) (((int)(unsigned char)(c0) << 24) | ((int)(unsigned char)(c1) << 16) | \
((int)(unsigned char)(c2) << 8) | ((int)(unsigned char)(c3)))
static inline __m512i _mm512_set_epi8(char __q63, char __q62, char __q61, char __q60,
char __q59, char __q58, char __q57, char __q56,
char __q55, char __q54, char __q53, char __q52,
char __q51, char __q50, char __q49, char __q48,
char __q47, char __q46, char __q45, char __q44,
char __q43, char __q42, char __q41, char __q40,
char __q39, char __q38, char __q37, char __q36,
char __q35, char __q34, char __q33, char __q32,
char __q31, char __q30, char __q29, char __q28,
char __q27, char __q26, char __q25, char __q24,
char __q23, char __q22, char __q21, char __q20,
char __q19, char __q18, char __q17, char __q16,
char __q15, char __q14, char __q13, char __q12,
char __q11, char __q10, char __q09, char __q08,
char __q07, char __q06, char __q05, char __q04,
char __q03, char __q02, char __q01, char __q00) {
return _mm512_set_epi32(PACK(__q63, __q62, __q61, __q60), PACK(__q59, __q58, __q57, __q56),
PACK(__q55, __q54, __q53, __q52), PACK(__q51, __q50, __q49, __q48),
PACK(__q47, __q46, __q45, __q44), PACK(__q43, __q42, __q41, __q40),
PACK(__q39, __q38, __q37, __q36), PACK(__q35, __q34, __q33, __q32),
PACK(__q31, __q30, __q29, __q28), PACK(__q27, __q26, __q25, __q24),
PACK(__q23, __q22, __q21, __q20), PACK(__q19, __q18, __q17, __q16),
PACK(__q15, __q14, __q13, __q12), PACK(__q11, __q10, __q09, __q08),
PACK(__q07, __q06, __q05, __q04), PACK(__q03, __q02, __q01, __q00));
}
#undef PACK
#endif // gcc version test
#endif // __AVX512F__
/* Missing zero-extension AVX and AVX512 intrinsics.
* Fixed in Microsoft Visual Studio 2017 version 15.7
* https://developercommunity.visualstudio.com/t/missing-zero-extension-avx-and-avx512-intrinsics/175737
*/
#if defined(_MSC_VER) && _MSC_VER < 1914
#ifdef __AVX2__
static inline __m256i _mm256_zextsi128_si256(__m128i a) {
return _mm256_inserti128_si256(_mm256_setzero_si256(), a, 0);
}
#endif // __AVX2__
#ifdef __AVX512F__
static inline __m512i _mm512_zextsi128_si512(__m128i a) {
return _mm512_inserti32x4(_mm512_setzero_si512(), a, 0);
}
#endif // __AVX512F__
#endif // defined(_MSC_VER) && _MSC_VER < 1914
#endif // include guard FALLBACK_BUILTINS_H