Merge implementations from "missing SSE implementations" to NEON #855

Open
nemequ opened this issue Jul 10, 2021 · 8 comments
Labels
accelerated-implementation: An implementation of one ISA extension with another (e.g., implementing SSE using NEON)
good first issue: This is perfect if you're new to the project and looking to help.

Comments

@nemequ
Member

nemequ commented Jul 10, 2021

http://www.alfredklomp.com/programming/sse-intrinsics/ has a great list of implementations of "missing" SSE instructions.

Unlike SSE, NEON isn't missing a lot of this functionality, so we should steal that code and use it to implement parts of the NEON API. For example:

  • _mm_cmple_epu8 → vcleq_u8 (see 5906cc9)
  • _mm_cmpge_epu8 → vcgeq_u8
  • _mm_cmpgt_epu8 → vcgtq_u8
  • _mm_min_epu16 → vminq_u16
  • _mm_absdiff_epu8 → vabdq_u8
  • _mm_bswap_epi16 → vrev16q_u8 (byte-reverse on reinterpreted u16/s16 data)

We can also use the same techniques for a bunch of other functions that page doesn't explicitly cover (e.g., vcleq_u16/vcleq_u32/vcleq_u64 can all use the same technique as _mm_cmple_epu8, though the 16- and 32-bit versions require SSE4.1 and the 64-bit version requires AVX-512VL).
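For illustration, here is a minimal sketch of what the wider comparisons might look like (helper names are placeholders, not SIMDe's actual functions); the only difference from the 8-bit case is which unsigned max intrinsic is available:

#include <smmintrin.h>  /* SSE4.1: _mm_max_epu16 / _mm_max_epu32 */

/* a <= b (unsigned) iff max(a, b) == b */
__m128i sse41_cmple_epu16 (__m128i a, __m128i b) {
    return _mm_cmpeq_epi16(_mm_max_epu16(a, b), b);
}

__m128i sse41_cmple_epu32 (__m128i a, __m128i b) {
    return _mm_cmpeq_epi32(_mm_max_epu32(a, b), b);
}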

Many of the same implementations could also be used in WASM (wasm_u8x16_le, wasm_u8x16_ge, wasm_u8x16_gt, wasm_u16x8_min, etc.).

There are also a few functions which are present in later versions of SSE but can be emulated with earlier versions. We should make sure our SSE implementations include those emulations as well.

As an example, 5906cc9 implements vcleq_u* using the code from _mm_cmple_epu8.
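To make the mapping concrete, here is a minimal sketch (not the actual code from 5906cc9) of how a vcleq_u8-style fallback could look when only SSE2 is available:

#include <emmintrin.h>  /* SSE2 */

/* vcleq_u8(a, b): unsigned a <= b, via the _mm_cmple_epu8 trick */
__m128i sse2_vcleq_u8 (__m128i a, __m128i b) {
    return _mm_cmpeq_epi8(_mm_max_epu8(a, b), b);  /* max(a, b) == b  <=>  a <= b */
}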

@nemequ nemequ added the good first issue and accelerated-implementation labels Jul 10, 2021
@aqrit
Contributor

aqrit commented Jul 16, 2021

__m128i sse2_min_epu16 (__m128i a, __m128i b) {
	return _mm_sub_epi16(a, _mm_subs_epu16(a, b)); /* a - max(a - b, 0) == min(a, b) */
}

__m128i sse2_max_epu16 (__m128i a, __m128i b) {
	return _mm_add_epi16(_mm_subs_epu16(a, b), b); /* max(a - b, 0) + b == max(a, b) */
}

These are missed optimizations in gcc, but clang has them.
AFAIK, both compilers automatically use a blend for the 32-bit versions.

@aqrit
Contributor

aqrit commented Jul 16, 2021

Here is another one:

__m128i sse2_subs_epi32(__m128i a, __m128i b) {
	__m128i t;
	/* t = the saturation value: INT32_MIN if b > a (signed), INT32_MAX otherwise */
	t = _mm_xor_si128(_mm_set1_epi32(0x7FFFFFFF), _mm_cmpgt_epi32(b, a));
	a = _mm_sub_epi32(a, b);

	/* overflow iff the sign of (a - b) differs from the sign of t;
	   select t in that case: return _mm_blendv_ps(a, t, a ^ t) */
	t = _mm_xor_si128(t, a);
	return _mm_xor_si128(a, _mm_and_si128(t, _mm_srai_epi32(t, 31)));
}

(edit) more:

__m128i sse2_absdiff_epi16 (__m128i x, __m128i y) {
	return _mm_sub_epi16(_mm_max_epi16(x, y), _mm_min_epi16(x, y));
}

__m128i sse2_absdiff_epi8 (__m128i a, __m128i b) {
	__m128i m = _mm_cmpgt_epi8(b, a); /* all-ones where b > a */
	a = _mm_sub_epi8(a, b);
	a = _mm_add_epi8(a, m);           /* conditional negate: (x - 1) ^ -1 == -x */
	return _mm_xor_si128(a, m);
}

@nemequ
Member Author

nemequ commented Jul 16, 2021

Nice, thanks. Those would be great for vminq_u16, vmaxq_u16, and vqsubq_s32.

There are tons of these floating around the internet, and I'd like to try to get as many as possible merged into SIMDe. Sometimes they are for missing functions, sometimes for emulating a newer instruction using an older extension (like the min/max functions you mentioned). Both are very useful to us.

nemequ added a commit that referenced this issue Jul 18, 2021
These are mostly based on what we have for the scalar versions in
simde-math.h, but @aqrit suggested basically the same thing in
issue #855 for the 32-bit signed version, so that implementation
is partially based on that, too.
@aqrit
Contributor

aqrit commented Jul 21, 2021

Better lowering:

__m128i sse2_cmpgt_epu16 (__m128i x, __m128i y) {
	x = _mm_subs_epu16(x, y);                                        /* x > y ? x - y : 0 */
	return _mm_adds_epu16(x, _mm_sub_epi16(_mm_setzero_si128(), x)); /* any nonzero lane saturates to 0xFFFF */
}

@aqrit
Contributor

aqrit commented Jul 24, 2021

__m128i sse2_cmpgt_epu32(__m128i x, __m128i y) {
    /* signed compare, flipped in lanes where the sign bits of x and y differ */
    return _mm_xor_si128(_mm_cmpgt_epi32(x, y),
                         _mm_srai_epi32(_mm_xor_si128(x, y), 31));
}

Might help with min_u32.

@aqrit
Contributor

aqrit commented Jul 24, 2021

collection so far:

__m128i sse2_cmpgt_epu8 (__m128i a, __m128i b) {
    a = _mm_subs_epu8(a, b);
    return _mm_adds_epu8(a, _mm_sub_epi8(_mm_setzero_si128(), a));
}
__m128i sse2_cmplt_epu8 (__m128i a, __m128i b) {
    return sse2_cmpgt_epu8(b, a);
}
__m128i sse2_cmpge_epu8 (__m128i a, __m128i b) {
    return _mm_cmpeq_epi8(_mm_min_epu8(a, b), b);
}
__m128i sse2_cmple_epu8 (__m128i a, __m128i b) {
    return _mm_cmpeq_epi8(_mm_max_epu8(a, b), b);
}


__m128i sse2_cmpgt_epu16 (__m128i a, __m128i b) {
    a = _mm_subs_epu16(a, b);
    return _mm_adds_epu16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
}
__m128i sse2_cmplt_epu16 (__m128i a, __m128i b) {
    return sse2_cmpgt_epu16(b, a);
}
__m128i sse2_cmpge_epu16 (__m128i a, __m128i b) {
    return _mm_cmpeq_epi16(_mm_subs_epu16(b, a), _mm_setzero_si128());
}
__m128i sse2_cmple_epu16 (__m128i a, __m128i b) {
    return _mm_cmpeq_epi16(_mm_subs_epu16(a, b), _mm_setzero_si128());
}


__m128i sse2_cmpgt_epu32(__m128i a, __m128i b) {
    return _mm_xor_si128(_mm_cmpgt_epi32(a, b),
                         _mm_srai_epi32(_mm_xor_si128(a, b), 31));
}
__m128i sse2_cmplt_epu32(__m128i a, __m128i b) {
    return sse2_cmpgt_epu32(b, a);
}
__m128i sse2_cmpge_epu32(__m128i a, __m128i b) {
    ??
}
__m128i sse2_cmple_epu32(__m128i a, __m128i b) {
    ??
}


__m128i sse2_min_epi8 (__m128i a, __m128i b) {
    ??
}
__m128i sse2_min_epu8 (__m128i a, __m128i b) {
    return _mm_min_epu8(a, b);
}
__m128i sse2_min_epi16 (__m128i a, __m128i b) {
    return _mm_min_epi16(a, b);
}
__m128i sse2_min_epu16 (__m128i a, __m128i b) {
    return _mm_sub_epi16(a, _mm_subs_epu16(a, b));
}
__m128i sse2_min_epi32 (__m128i a, __m128i b) {
    __m128i m = _mm_cmpgt_epi32(b, a);
    return _mm_or_si128(_mm_and_si128(a, m), _mm_andnot_si128(m, b));
}
__m128i sse2_min_epu32 (__m128i a, __m128i b) {
    __m128i m = sse2_cmpgt_epu32(b, a);
    return _mm_or_si128(_mm_and_si128(a, m), _mm_andnot_si128(m, b));
}


__m128i sse2_max_epi8 (__m128i a, __m128i b) {
    ??
}
__m128i sse2_max_epu8 (__m128i a, __m128i b) {
    return _mm_max_epu8(a, b);
}
__m128i sse2_max_epi16 (__m128i a, __m128i b) {
    return _mm_max_epi16(a, b);
}
__m128i sse2_max_epu16 (__m128i a, __m128i b) {
    return _mm_add_epi16(_mm_subs_epu16(a, b), b);
}
__m128i sse2_max_epi32 (__m128i a, __m128i b) {
    __m128i m = _mm_cmpgt_epi32(a, b);
    return _mm_or_si128(_mm_and_si128(a, m), _mm_andnot_si128(m, b));
}
__m128i sse2_max_epu32 (__m128i a, __m128i b) {
    __m128i m = sse2_cmpgt_epu32(a, b);
    return _mm_or_si128(_mm_and_si128(a, m), _mm_andnot_si128(m, b));
}


__m128i sse2_absdiff_epi8 (__m128i a, __m128i b) {
    __m128i m = _mm_cmpgt_epi8(b, a);
    a = _mm_sub_epi8(a, b);
    a = _mm_add_epi8(a, m);
    return _mm_xor_si128(a, m);
}
__m128i sse2_absdiff_epi16 (__m128i x, __m128i y) {
    return _mm_sub_epi16(_mm_max_epi16(x, y), _mm_min_epi16(x, y));
}
__m128i sse2_absdiff_epi32 (__m128i a, __m128i b) {
    __m128i m = _mm_cmpgt_epi32(b, a);
    a = _mm_sub_epi32(a, b);
    a = _mm_add_epi32(a, m);
    return _mm_xor_si128(a, m);
}


__m128i sse2_absdiff_epu8 (__m128i x, __m128i y) {
    return _mm_or_si128(_mm_subs_epu8(x, y), _mm_subs_epu8(y, x));
}
__m128i sse2_absdiff_epu16 (__m128i x, __m128i y) {
    return _mm_or_si128(_mm_subs_epu16(x, y), _mm_subs_epu16(y, x));
}
__m128i sse2_absdiff_epu32 (__m128i a, __m128i b) {
    ??
}


__m128i sse2_subs_epu32 (__m128i a, __m128i b) {
    __m128i m = sse2_cmpgt_epu32(a, b);
    return _mm_and_si128(m, _mm_sub_epi32(a, b));
}
__m128i sse2_subs_epi32(__m128i a, __m128i b) {
    __m128i t;
    t = _mm_xor_si128(_mm_set1_epi32(0x7FFFFFFF), _mm_cmpgt_epi32(b, a));
    a = _mm_sub_epi32(a, b);
    t = _mm_xor_si128(t, a);
    return _mm_xor_si128(a, _mm_and_si128(t, _mm_srai_epi32(t, 31)));
}


__m128i sse2_adds_epu32 (__m128i a, __m128i b) {
    __m128i t = _mm_add_epi32(a, b);
    return _mm_or_si128(t, sse2_cmpgt_epu32(b, t));
}
__m128i sse2_adds_epi32 (__m128i a, __m128i b) {
    // https://stackoverflow.com/q/29498824
}

These implementations avoid generating/loading constants, which may not be desirable in some contexts.
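As a quick sanity check, any of these can be compared against the corresponding native intrinsic on an SSE4.1 machine; a standalone sketch (not part of SIMDe's test suite, compiled with -msse4.1, function copied from the collection above) might look like this:

#include <smmintrin.h>  /* SSE4.1, for the _mm_min_epu16 reference */
#include <stdint.h>
#include <stdio.h>

/* SSE2 emulation from the collection above */
static __m128i sse2_min_epu16 (__m128i a, __m128i b) {
    return _mm_sub_epi16(a, _mm_subs_epu16(a, b));
}

int main(void) {
    const uint16_t av[8] = { 0, 1, 0x7FFF, 0x8000, 0xFFFF,     2,   300, 40000 };
    const uint16_t bv[8] = { 1, 0, 0x8000, 0x7FFF,      0, 65535, 40000,   300 };
    __m128i a = _mm_loadu_si128((const __m128i *) av);
    __m128i b = _mm_loadu_si128((const __m128i *) bv);
    __m128i r = sse2_min_epu16(a, b);
    __m128i e = _mm_min_epu16(a, b);  /* SSE4.1 reference */
    int ok = _mm_movemask_epi8(_mm_cmpeq_epi8(r, e)) == 0xFFFF;
    printf("sse2_min_epu16: %s\n", ok ? "match" : "MISMATCH");
    return ok ? 0 : 1;
}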

cc @aklomp

nemequ added a commit that referenced this issue Jul 24, 2021
These are based on implementations suggested by @aqrit at
#855 (comment)
I've just extended them to other types and added some similar
implementations for POWER and WASM SIMD128.
@nemequ
Copy link
Member Author

nemequ commented Jul 24, 2021

return _mm_xor_si128(_mm_cmpgt_epi32(x, y),
                     _mm_srai_epi32(_mm_xor_si128(x, y), 31));

https://godbolt.org/z/T73MbPEnh

I agree, the throughput isn't quite as good, but the latency on that mov is painful, plus the memory to store the data…

I'll go through your last comment soon, but I think I've got most of them in place (though not merged yet). Thanks for putting them together.

@aklomp

aklomp commented May 20, 2022

FWIW, my "missing SSE intrinsics" project is now canonically hosted at https://github.com/aklomp/missing-sse-intrinsics.
