Skip to content

Commit 6fb7ef4

Browse files
authored
Optimize _mm_sign_epi* intrinsics (#12417)
1 parent fd1603c commit 6fb7ef4

File tree

2 files changed

+15
-12
lines changed

2 files changed

+15
-12
lines changed

site/source/docs/porting/simd.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -809,11 +809,11 @@ The following table highlights the availability and expected performance of diff
809809
* - _mm_shuffle_epi8
810810
- ⚠️ emulated with a SIMD swizzle+and+const
811811
* - _mm_sign_epi8
812-
- ⚠️ emulated with a SIMD complex shuffle+cmp+xor+andnot
812+
- ⚠️ emulated with SIMD two cmp+two logical+add
813813
* - _mm_sign_epi16
814-
- ⚠️ emulated with a SIMD shr+cmp+xor+andnot
814+
- ⚠️ emulated with SIMD two cmp+two logical+add
815815
* - _mm_sign_epi32
816-
- ⚠️ emulated with a SIMD shr+cmp+xor+andnot
816+
- ⚠️ emulated with SIMD two cmp+two logical+add
817817

818818
⚫ The SSSE3 functions that deal with 64-bit wide MMX registers are not available:
819819
- _mm_abs_pi8, _mm_abs_pi16, _mm_abs_pi32, _mm_alignr_pi8, _mm_hadd_pi16, _mm_hadd_pi32, _mm_hadds_pi16, _mm_hsub_pi16, _mm_hsub_pi32, _mm_hsubs_pi16, _mm_maddubs_pi16, _mm_mulhrs_pi16, _mm_shuffle_pi8, _mm_sign_pi8, _mm_sign_pi16 and _mm_sign_pi32

system/include/SSE/tmmintrin.h

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -147,25 +147,28 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b)
147147
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
148148
_mm_sign_epi8(__m128i __a, __m128i __b)
149149
{
150-
__m128i __mask = (__m128i)wasm_i8x16_shr((v128_t)__b, 7);
151-
__m128i __zeromask = _mm_cmpeq_epi8(__b, _mm_setzero_si128());
152-
return _mm_andnot_si128(__zeromask, _mm_xor_si128(_mm_add_epi8(__a, __mask), __mask));
150+
const __m128i __zero = _mm_setzero_si128();
151+
__a = _mm_andnot_si128(_mm_cmpeq_epi8(__b, __zero), __a);
152+
const __m128i __mask = _mm_cmpgt_epi8(__zero, __b);
153+
return _mm_xor_si128(_mm_add_epi8(__a, __mask), __mask);
153154
}
154155

155156
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
156157
_mm_sign_epi16(__m128i __a, __m128i __b)
157158
{
158-
__m128i __mask = _mm_srai_epi16(__b, 15);
159-
__m128i __zeromask = _mm_cmpeq_epi16(__b, _mm_setzero_si128());
160-
return _mm_andnot_si128(__zeromask, _mm_xor_si128(_mm_add_epi16(__a, __mask), __mask));
159+
const __m128i __zero = _mm_setzero_si128();
160+
__a = _mm_andnot_si128(_mm_cmpeq_epi16(__b, __zero), __a);
161+
const __m128i __mask = _mm_cmpgt_epi16(__zero, __b);
162+
return _mm_xor_si128(_mm_add_epi16(__a, __mask), __mask);
161163
}
162164

163165
static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
164166
_mm_sign_epi32(__m128i __a, __m128i __b)
165167
{
166-
__m128i __mask = _mm_srai_epi32(__b, 31);
167-
__m128i __zeromask = _mm_cmpeq_epi32(__b, _mm_setzero_si128());
168-
return _mm_andnot_si128(__zeromask, _mm_xor_si128(_mm_add_epi32(__a, __mask), __mask));
168+
const __m128i __zero = _mm_setzero_si128();
169+
__a = _mm_andnot_si128(_mm_cmpeq_epi32(__b, __zero), __a);
170+
const __m128i __mask = _mm_cmpgt_epi32(__zero, __b);
171+
return _mm_xor_si128(_mm_add_epi32(__a, __mask), __mask);
169172
}
170173

171174
// Unavailable functions:

0 commit comments

Comments
 (0)