-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request lh3#359 from jmarshall/neon
Add ARM Neon and scalar implementations of SIMD functions
- Loading branch information
Showing
4 changed files
with
194 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
#ifndef NEON_SSE_H | ||
#define NEON_SSE_H | ||
|
||
#include <arm_neon.h> | ||
|
||
/* Name the NEON vector of sixteen unsigned bytes `__m128i`, which is the
 * vector type taken and returned by the `_mm_*` wrappers in this header. */
typedef uint8x16_t __m128i; | ||
|
||
/* Load 16 bytes starting at ptr into a vector, using vld1q_u8. */
static inline __m128i _mm_load_si128(const __m128i *ptr)
{
    const uint8_t *src = (const uint8_t *) ptr;
    return vld1q_u8(src);
}
/* Broadcast n to all four 32-bit lanes, then view the result as bytes. */
static inline __m128i _mm_set1_epi32(int n)
{
    int32x4_t lanes = vdupq_n_s32(n);
    return vreinterpretq_u8_s32(lanes);
}
/* Store the 16 bytes of a to the memory at ptr, using vst1q_u8. */
static inline void _mm_store_si128(__m128i *ptr, __m128i a)
{
    uint8_t *dst = (uint8_t *) ptr;
    vst1q_u8(dst, a);
}
|
||
/* Per-byte unsigned saturating addition (vqaddq_u8). */
static inline __m128i _mm_adds_epu8(__m128i a, __m128i b)
{
    __m128i sum = vqaddq_u8(a, b);
    return sum;
}
/* Per-byte unsigned maximum (vmaxq_u8). */
static inline __m128i _mm_max_epu8(__m128i a, __m128i b)
{
    __m128i larger = vmaxq_u8(a, b);
    return larger;
}
/* Broadcast the signed byte n to all 16 lanes, then view them as unsigned. */
static inline __m128i _mm_set1_epi8(int8_t n)
{
    int8x16_t lanes = vdupq_n_s8(n);
    return vreinterpretq_u8_s8(lanes);
}
/* Per-byte unsigned saturating subtraction, a - b (vqsubq_u8). */
static inline __m128i _mm_subs_epu8(__m128i a, __m128i b)
{
    __m128i diff = vqsubq_u8(a, b);
    return diff;
}
|
||
/*
 * 16-bit lane operations.  __m128i is a byte vector in this header, so each
 * wrapper reinterprets its operands as eight 16-bit lanes, applies the NEON
 * operation, and reinterprets the result back to bytes.  The casts are
 * written out directly, so no helper macros are left defined afterwards.
 */

/* Per-lane signed 16-bit saturating addition (vqaddq_s16). */
static inline __m128i _mm_adds_epi16(__m128i a, __m128i b)
{
    int16x8_t sum = vqaddq_s16(vreinterpretq_s16_u8(a), vreinterpretq_s16_u8(b));
    return vreinterpretq_u8_s16(sum);
}

/* Per-lane signed 16-bit a > b comparison (vcgtq_s16); result is a lane mask. */
static inline __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
{
    uint16x8_t mask = vcgtq_s16(vreinterpretq_s16_u8(a), vreinterpretq_s16_u8(b));
    return vreinterpretq_u8_u16(mask);
}

/* Per-lane signed 16-bit maximum (vmaxq_s16). */
static inline __m128i _mm_max_epi16(__m128i a, __m128i b)
{
    int16x8_t larger = vmaxq_s16(vreinterpretq_s16_u8(a), vreinterpretq_s16_u8(b));
    return vreinterpretq_u8_s16(larger);
}

/* Broadcast the 16-bit integer n to all eight lanes. */
static inline __m128i _mm_set1_epi16(int16_t n)
{
    int16x8_t lanes = vdupq_n_s16(n);
    return vreinterpretq_u8_s16(lanes);
}

/* Per-lane unsigned 16-bit saturating subtraction, a - b (vqsubq_u16). */
static inline __m128i _mm_subs_epu16(__m128i a, __m128i b)
{
    uint16x8_t diff = vqsubq_u16(vreinterpretq_u16_u8(a), vreinterpretq_u16_u8(b));
    return vreinterpretq_u8_u16(diff);
}
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
#ifndef SCALAR_SSE_H | ||
#define SCALAR_SSE_H | ||
|
||
#include <assert.h> | ||
#include <stdint.h> | ||
#include <string.h> | ||
|
||
/*
 * Scalar stand-in for a 128-bit integer vector: the same 16 bytes of
 * storage, viewed either as 16 unsigned bytes or as 8 signed 16-bit lanes.
 */
union m128i {
    uint8_t u8[16];  /* byte lanes */
    int16_t i16[8];  /* 16-bit lanes */
};
typedef union m128i __m128i;
|
||
static inline __m128i _mm_set1_epi32(int32_t n) { | ||
assert(n >= 0 && n <= 255); | ||
__m128i r; memset(&r, n, sizeof r); return r; | ||
} | ||
|
||
/* Copy the 16 bytes at ptr into a vector value and return it. */
static inline __m128i _mm_load_si128(const __m128i *ptr)
{
    __m128i loaded;

    memcpy(&loaded, ptr, sizeof loaded);
    return loaded;
}
/* Copy the 16 bytes of a to the memory at ptr. */
static inline void _mm_store_si128(__m128i *ptr, __m128i a)
{
    const __m128i *src = &a;

    memcpy(ptr, src, sizeof *src);
}
|
||
/* Return 1 if every one of the 16 bytes of a is zero, otherwise 0. */
static inline int m128i_allzero(__m128i a)
{
    int lane;

    for (lane = 0; lane < 16; lane++) {
        if (a.u8[lane] != 0)
            return 0;
    }
    return 1;
}
|
||
/*
 * Shift the whole vector towards higher byte indices by n bytes, filling
 * the vacated low bytes with zero.
 *
 * As with the SSE2 instruction, only the low 8 bits of n are used and any
 * shift count above 15 yields an all-zero vector.  Clamping here also keeps
 * the memmove below inside the 16-byte array for every possible n.
 */
static inline __m128i _mm_slli_si128(__m128i a, int n) {
    unsigned shift = (unsigned) n & 0xffu;

    if (shift > 15) {
        memset(&a, 0, sizeof a);
        return a;
    }
    /* Source and destination overlap, hence memmove rather than memcpy. */
    memmove(&a.u8[shift], &a.u8[0], 16 - shift);
    memset(&a.u8[0], 0, shift);
    return a;
}
|
||
/* Per-byte unsigned addition, with each sum clamped to at most 255. */
static inline __m128i _mm_adds_epu8(__m128i a, __m128i b)
{
    int lane;

    for (lane = 0; lane < 16; lane++) {
        unsigned sum = (unsigned) a.u8[lane] + b.u8[lane];
        if (sum > 255)
            sum = 255;
        a.u8[lane] = (uint8_t) sum;
    }
    return a;
}
|
||
/* Per-byte unsigned maximum of a and b. */
static inline __m128i _mm_max_epu8(__m128i a, __m128i b)
{
    int lane;

    for (lane = 0; lane < 16; lane++) {
        const uint8_t x = a.u8[lane];
        const uint8_t y = b.u8[lane];
        a.u8[lane] = (x < y) ? y : x;
    }
    return a;
}
|
||
/* Return the largest of the 16 unsigned bytes in a. */
static inline uint8_t m128i_max_u8(__m128i a)
{
    uint8_t best = a.u8[0];
    int lane;

    for (lane = 1; lane < 16; lane++) {
        if (a.u8[lane] > best)
            best = a.u8[lane];
    }
    return best;
}
|
||
static inline __m128i _mm_set1_epi8(int8_t n) { __m128i r; memset(&r, n, sizeof r); return r; } | ||
|
||
/* Per-byte unsigned subtraction a - b, with negative results clamped to 0. */
static inline __m128i _mm_subs_epu8(__m128i a, __m128i b)
{
    int lane;

    for (lane = 0; lane < 16; lane++) {
        const uint8_t x = a.u8[lane];
        const uint8_t y = b.u8[lane];
        a.u8[lane] = (x > y) ? (uint8_t) (x - y) : 0;
    }
    return a;
}
|
||
/*
 * Per-lane signed 16-bit addition with saturation in both directions:
 * sums above 32767 become 32767 and sums below -32768 become -32768.
 */
static inline __m128i _mm_adds_epi16(__m128i a, __m128i b) {
    int i;
    for (i = 0; i < 8; i++) {
        /* Widen first so the raw sum cannot overflow. */
        int32_t sum = (int32_t) a.i16[i] + b.i16[i];
        if (sum > INT16_MAX) sum = INT16_MAX;
        else if (sum < INT16_MIN) sum = INT16_MIN;
        a.i16[i] = (int16_t) sum;
    }
    return a;
}
|
||
/*
 * Per-lane signed 16-bit comparison: each result lane has all bits set when
 * a > b in that lane and all bits clear otherwise.
 *
 * The all-ones mask is written as -1, which is exactly representable in
 * int16_t, rather than 0xffff, which is out of range for the lane type.
 */
static inline __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) {
    int i;
    for (i = 0; i < 8; i++)
        a.i16[i] = (a.i16[i] > b.i16[i])? -1 : 0;
    return a;
}
|
||
/* Per-lane signed 16-bit maximum of a and b. */
static inline __m128i _mm_max_epi16(__m128i a, __m128i b)
{
    int lane;

    for (lane = 0; lane < 8; lane++) {
        const int16_t x = a.i16[lane];
        const int16_t y = b.i16[lane];
        a.i16[lane] = (x < y) ? y : x;
    }
    return a;
}
|
||
static inline __m128i _mm_set1_epi16(int16_t n) { | ||
__m128i r; | ||
r.i16[0] = r.i16[1] = r.i16[2] = r.i16[3] = | ||
r.i16[4] = r.i16[5] = r.i16[6] = r.i16[7] = n; | ||
return r; | ||
} | ||
|
||
/* Return the largest of the eight signed 16-bit lanes in a. */
static inline int16_t m128i_max_s16(__m128i a)
{
    int16_t best = a.i16[0];
    int lane;

    for (lane = 1; lane < 8; lane++) {
        if (a.i16[lane] > best)
            best = a.i16[lane];
    }
    return best;
}
|
||
/*
 * Per-lane unsigned 16-bit subtraction a - b, with results that would be
 * negative clamped to 0.
 *
 * The union only exposes the lanes as int16_t, so each operand is converted
 * to uint16_t before comparing and subtracting; otherwise lanes holding
 * 0x8000 or above would be treated as negative numbers.
 */
static inline __m128i _mm_subs_epu16(__m128i a, __m128i b) {
    int i;
    for (i = 0; i < 8; i++) {
        uint16_t ua = (uint16_t) a.i16[i];
        uint16_t ub = (uint16_t) b.i16[i];
        uint16_t diff = (ua > ub)? (uint16_t) (ua - ub) : 0;
        /* Store the bit pattern back without a signed narrowing conversion. */
        memcpy(&a.i16[i], &diff, sizeof diff);
    }
    return a;
}
|
||
#endif |