SSE2 intrinsics - comparing unsigned integers
One way of implementing compares for unsigned 8 bit vectors is to exploit _mm_max_epu8, which returns the maximum of unsigned 8 bit int elements. You can compare for equality the (unsigned) maximum value of two elements with one of the source elements and then return the appropriate result. This translates to 2 instructions for >= or <=, and 3 instructions for > or <.
Example code:
#include <stdio.h>
#include <emmintrin.h> // SSE2
/* a >= b on unsigned bytes: max(a, b) equals a exactly when a is not below b.
   Each result byte is 0xFF where the compare holds and 0x00 where it does not.
   Note: the first operand appears twice in the expansion. */
#define _mm_cmpge_epu8(a, b) \
_mm_cmpeq_epi8(_mm_max_epu8(a, b), a)
/* a <= b: the >= compare with the operands swapped. */
#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a)
/* a > b: bitwise NOT of (a <= b), done by XOR with an all-ones vector. */
#define _mm_cmpgt_epu8(a, b) \
_mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1))
/* a < b: the > compare with the operands swapped. */
#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a)
/*
 * Print `label`, then " = ", then the 16 unsigned bytes of `v` in memory
 * order, each right-aligned in a 4-character field and separated by one
 * space, followed by a newline.
 *
 * This replaces the `%vhhu` vector conversion, which is a libc extension
 * and not part of standard C printf.
 */
static void print_epu8(const char *label, __m128i v)
{
    unsigned char lane[16];
    int i;

    /* Unaligned store: `lane` has no 16-byte alignment guarantee. */
    _mm_storeu_si128((__m128i *)lane, v);

    printf("%s = ", label);
    for (i = 0; i < 16; i++) {
        printf(i ? " %4u" : "%4u", (unsigned)lane[i]);
    }
    putchar('\n');
}

/*
 * Demo: run the four unsigned byte compares over a set of boundary values
 * (0, 1, 127, 128, 254, 255) and print the inputs and the resulting masks.
 */
int main(void)
{
    __m128i va = _mm_setr_epi8(0, 0, 1, 1, 1, 127, 127, 127, 128, 128, 128, 254, 254, 254, 255, 255);
    __m128i vb = _mm_setr_epi8(0, 255, 0, 1, 255, 0, 127, 255, 0, 128, 255, 0, 254, 255, 0, 255);

    __m128i v_ge = _mm_cmpge_epu8(va, vb);
    __m128i v_le = _mm_cmple_epu8(va, vb);
    __m128i v_gt = _mm_cmpgt_epu8(va, vb);
    __m128i v_lt = _mm_cmplt_epu8(va, vb);

    print_epu8("va", va);
    print_epu8("vb", vb);
    print_epu8("v_ge", v_ge);
    print_epu8("v_le", v_le);
    print_epu8("v_gt", v_gt);
    print_epu8("v_lt", v_lt);

    return 0;
}
Compile and run:
$ gcc -Wall _mm_cmplt_epu8.c && ./a.out
va = 0 0 1 1 1 127 127 127 128 128 128 254 254 254 255 255
vb = 0 255 0 1 255 0 127 255 0 128 255 0 254 255 0 255
v_ge = 255 0 255 255 0 255 255 0 255 255 0 255 255 0 255 255
v_le = 255 255 0 255 255 0 255 255 0 255 255 0 255 255 0 255
v_gt = 0 0 255 0 0 255 0 0 255 0 0 255 0 0 255 0
v_lt = 0 255 0 0 255 0 0 255 0 0 255 0 0 255 0 0
SSE2 intrinsics - find max of two unsigned short vectors
One (somewhat inefficient) way of doing it is to offset the input values by 0x8000 and then add this offset back to the result, e.g.:
#ifndef __SSE4_1__
/*
 * Lane-wise maximum of eight unsigned 16-bit integers using only SSE2.
 *
 * SSE2 has a signed 16-bit max but no unsigned one. Subtracting 0x8000 from
 * every lane maps the unsigned range onto the signed range while keeping the
 * ordering, so the signed max picks the right lane; adding 0x8000 back
 * restores the original value.
 *
 * Declared `static inline`: a plain `inline` definition in C provides no
 * external definition, so a call that is not inlined would fail to link.
 */
static inline __m128i _mm_max_epu16(const __m128i v0, const __m128i v1)
{
    /* Bit pattern 0x8000 in every lane, written as a value that fits in short. */
    const __m128i bias = _mm_set1_epi16((short)-0x8000);

    return _mm_add_epi16(
        _mm_max_epi16(_mm_sub_epi16(v0, bias), _mm_sub_epi16(v1, bias)),
        bias);
}
#endif
With gcc or clang this generates one load instruction for the constant and four arithmetic instructions.
Note that you can use _mm_xor_si128 in place of _mm_add_epi16/_mm_sub_epi16, which is arguably clearer in intent, and may give better performance on your target architecture:
#ifndef __SSE4_1__
/*
 * Lane-wise maximum of eight unsigned 16-bit integers using only SSE2.
 *
 * XOR with 0x8000 flips the top bit of every lane, which maps the unsigned
 * range onto the signed range while keeping the ordering. The signed max
 * then picks the right lane, and a second XOR restores the original value.
 *
 * Declared `static inline`: a plain `inline` definition in C provides no
 * external definition, so a call that is not inlined would fail to link.
 */
static inline __m128i _mm_max_epu16(const __m128i v0, const __m128i v1)
{
    /* Bit pattern 0x8000 in every lane, written as a value that fits in short. */
    const __m128i sign_bit = _mm_set1_epi16((short)-0x8000);

    return _mm_xor_si128(
        _mm_max_epi16(_mm_xor_si128(v0, sign_bit), _mm_xor_si128(v1, sign_bit)),
        sign_bit);
}
#endif
SSE Compare Packed Unsigned Bytes
There is no greater-than compare for packed unsigned bytes, so I unpacked the bytes into words (since the values are non-negative, this acts as a conversion from unsigned to signed as well as an extension from byte to word) and compared them using PCMPGTW.
SSE2 intrinsics - comparing 2 __m128i's containing 4 int32's each to see how many are equal
You can AND the compare result with a vector of ones to create a vector of zeros and ones. Then use a horizontal add operation to count the ones. Here are some possibilities.
#include "stdio.h"
#include "stdint.h"
#include "intrin.h"
//----------------------------------------------------------------------------
// non-SSE method (reference for result check)
// Scalar reference: count how many of the four 32-bit lanes of `value`
// are all-ones (0xFFFFFFFF). Returns a count in 0..4.
static int method0 (__m128i value)
{
    // A union gives a well-defined view of the vector's lanes; reading the
    // vector through a casted uint32_t pointer would be type-punning through
    // an incompatible pointer type.
    union { __m128i vec; uint32_t lane [4]; } view;
    int index, total = 0;

    view.vec = value;
    for (index = 0; index < 4; index++)
        total += view.lane [index] == 0xFFFFFFFF;
    return total;
}
//----------------------------------------------------------------------------
//
// horizontalAddBytes - return integer total of all 16 bytes in xmm argument
//
static int horizontalAddBytes (__m128i byteArray)
{
    const __m128i zero = _mm_setzero_si128 ();

    // Sum of absolute differences against zero: each 64-bit half of the
    // result holds the sum of the 8 unsigned bytes in that half of the input.
    const __m128i halves = _mm_sad_epu8 (byteArray, zero);

    // 0xAA broadcasts 32-bit lane 2 (the upper half's sum) to every lane, so
    // after the add lane 0 holds the total of all 16 bytes.
    const __m128i total = _mm_add_epi32 (halves, _mm_shuffle_epi32 (halves, 0xAA));

    // Extract only lane 0. The 64-bit extract is unavailable on 32-bit
    // targets and would also pull in lane 1, which holds a partial sum.
    return _mm_cvtsi128_si32 (total);
}
//----------------------------------------------------------------------------
// requires SSE2
static int method1 (__m128i value)
{
    // A logical right shift by 31 leaves just the top bit of each 32-bit
    // lane, so every lane becomes 0 or 1; summing all bytes then counts them.
    const __m128i topBits = _mm_srli_epi32 (value, 31);

    return horizontalAddBytes (topBits);
}
//----------------------------------------------------------------------------
// requires SSE3
static int method2 (__m128i value)
{
    // Bit pattern of 1.0f in every lane: ANDing it with a lane turns an
    // all-ones lane into 1.0f and an all-zero lane into 0.0f.
    const __m128 one = _mm_set1_ps (1);
    const __m128 flags = _mm_and_ps (_mm_castsi128_ps (value), one);

    // Two horizontal adds fold the four lanes into a single sum.
    const __m128 pairSums = _mm_hadd_ps (flags, flags);
    const __m128 total = _mm_hadd_ps (pairSums, pairSums);

    return _mm_cvtss_si32 (total);
}
//----------------------------------------------------------------------------
// requires SSSE3
static int method3 (__m128i value)
{
    // Reduce each 32-bit lane to its top bit (0 or 1).
    const __m128i flags = _mm_srli_epi32 (value, 31);

    // Two integer horizontal adds fold the four lanes into a single sum.
    const __m128i pairSums = _mm_hadd_epi32 (flags, flags);
    const __m128i total = _mm_hadd_epi32 (pairSums, pairSums);

    return _mm_cvtsi128_si32 (total);
}
//----------------------------------------------------------------------------
// Write bits 0..3 of `mask` as 0/1 values into data[0], data[4], data[8] and
// data[12]. No other element of `data` is touched.
static void createTestData (uint32_t *data, int mask)
{
    int bit = 0;

    while (bit < 4)
    {
        if (mask & (1 << bit))
            data [bit * 4] = 1;
        else
            data [bit * 4] = 0;
        ++bit;
    }
}
//----------------------------------------------------------------------------
// Test driver: for every pair of 4-bit patterns, build two vectors, compare
// them lane by lane, and check that each counting method agrees with the
// scalar reference. Prints one line per mismatch and nothing on success.
int main (void)
{
    uint32_t source [16];
    uint32_t reference [16];
    int srcMask, refMask;

    for (srcMask = 0; srcMask < 16; srcMask++)
    {
        for (refMask = 0; refMask < 16; refMask++)
        {
            __m128i lhs, rhs, equalMask;
            int expected, actual1, actual2, actual3;

            createTestData (source, srcMask);
            createTestData (reference, refMask);

            // The four values of each operand sit 4 elements apart in their arrays.
            lhs = _mm_set_epi32 (source [12], source [8], source [4], source [0]);
            rhs = _mm_set_epi32 (reference [12], reference [8], reference [4], reference [0]);

            // Lanes that match become all-ones, the rest become zero.
            equalMask = _mm_cmpeq_epi32 (lhs, rhs);

            expected = method0 (equalMask);
            actual1 = method1 (equalMask);
            actual2 = method2 (equalMask);
            actual3 = method3 (equalMask);

            if (actual1 != expected)
                printf ("method1, index %d,%d expected %d, actual %d\n", srcMask, refMask, expected, actual1);
            if (actual2 != expected)
                printf ("method2, index %d,%d expected %d, actual %d\n", srcMask, refMask, expected, actual2);
            if (actual3 != expected)
                printf ("method3, index %d,%d expected %d, actual %d\n", srcMask, refMask, expected, actual3);
        }
    }
    return 0;
}
//----------------------------------------------------------------------------
SSE intrinsics: Convert 32-bit floats to UNSIGNED 8-bit integers
There is no direct conversion from float to byte, _mm_cvtps_pi8
is a composite. _mm_cvtps_pi16
is also a composite, and in this case it's just doing some pointless stuff that you undo with the shuffle. They also return annoying __m64
's.
Anyway, we can convert to dwords (signed, but that doesn't matter), and then pack (unsigned) or shuffle them into bytes. _mm_shuffle_(e)pi8
generates a pshufb
, Core2 45nm and AMD processors aren't too fond of it and you have to get a mask from somewhere.
Either way you don't have to round to the nearest integer first, the convert will do that. At least, if you haven't messed with the rounding mode.
Using packs 1: (not tested) -- probably not useful, packusdw
already outputs unsigned words but then packuswb
wants signed words again. Kept around because it is referred to elsewhere.
cvtps2dq xmm0, xmm0 ; 4 floats -> 4 signed dwords, rounded per the current rounding mode
packusdw xmm0, xmm0 ; unsafe: saturates to a different range than packuswb accepts
packuswb xmm0, xmm0 ; words -> unsigned bytes; interprets its input words as signed
movd somewhere, xmm0 ; store the low dword, i.e. the 4 result bytes
Using a different pack:
cvtps2dq xmm0, xmm0 ; 4 floats -> 4 signed dwords, rounded per the current rounding mode
packssdw xmm0, xmm0 ; correct: signed saturation on first step to feed packuswb
packuswb xmm0, xmm0 ; signed words -> bytes with unsigned saturation
movd somewhere, xmm0 ; store the low dword, i.e. the 4 result bytes
Using shuffle: (not tested)
cvtps2dq xmm0, xmm0 ; 4 floats -> 4 signed dwords, rounded per the current rounding mode
pshufb xmm0, [shufmask] ; gather the low byte of each dword into bytes 0..3 (no saturation)
movd somewhere, xmm0 ; store the low dword, i.e. the 4 result bytes
; Indices 0, 4, 8, 12 select the lowest byte of each dword; 80h entries zero the destination byte.
shufmask: db 0, 4, 8, 12, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h
SSE2 integer overflow checking
Here is a somewhat more efficient version of @hirschhornsalz's sum_and_overflow
function:
void sum_and_overflow(__v4si a, __v4si b, __v4si& sum, __v4si& overflow)
{
__v4si sa, sb;
sum = _mm_add_epi32(a, b); // calculate sum
sa = _mm_xor_si128(sum, a); // compare sign of sum with sign of a
sb = _mm_xor_si128(sum, b); // compare sign of sum with sign of b
overflow = _mm_and_si128(sa, sb); // get overflow in sign bit
overflow = _mm_srai_epi32(overflow, 31); // convert to SIMD boolean (-1 == TRUE, 0 == FALSE)
}
It uses an expression for overflow detection from Hacker's Delight page 27:
sum = a + b;
overflow = (sum ^ a) & (sum ^ b); // overflow flag in sign bit
Note that the overflow vector will contain the more conventional SIMD boolean values of -1 for TRUE (overflow) and 0 for FALSE (no overflow). If you only need the overflow in the sign bit and the other bits are "don't care" then you can omit the last line of the function, reducing the number of SIMD instructions from 5 to 4.
NB: this solution, as well as the previous solution on which it is based are for signed integer values. A solution for unsigned values will require a slightly different approach (see @Stephen Canon's answer).
What is the most efficient way to do unsigned 64 bit comparison on SSE2?
Translated from Hacker's Delight:
/*
 * Unsigned 64-bit greater-than using only SSE2: each 64-bit lane of the
 * result is all-ones where a > b (unsigned) and zero otherwise.
 */
static
__m128i sse2_cmpgt_epu64(__m128i a, __m128i b) {
    const __m128i diff    = _mm_sub_epi64(b, a);
    const __m128i differs = _mm_xor_si128(b, a);

    /* Top bits equal: the answer is the top bit of b - a. */
    const __m128i same_top  = _mm_andnot_si128(differs, diff);
    /* Top bits differ: a > b exactly when a's top bit is set and b's is not. */
    const __m128i mixed_top = _mm_andnot_si128(b, a);

    /* Only bit 63 of each lane is meaningful here; the lower bits are garbage. */
    const __m128i flags = _mm_or_si128(same_top, mixed_top);

    /* Smear each dword's sign bit, then copy the upper dword of every 64-bit
       lane over the lower one so the whole lane carries the result. */
    const __m128i smeared = _mm_srai_epi32(flags, 31);
    return _mm_shuffle_epi32(smeared, _MM_SHUFFLE(3, 3, 1, 1));
}
Concept: If mixed "signs" (unsigned MSBs) then return a
else return b - a
(MSB(a) ^ MSB(b)) ? a : b - a; // result in MSB
This makes sense:
- if a's MSB is set and b's isn't, a is unsigned above (so MSB(a) is our result)
- if b's MSB is set and a's isn't, a is unsigned below (so MSB(a) is our result)
- if their MSBs are the same, their values are in the same half of the unsigned range, so b-a is effectively a 63-bit subtraction. The MSBs will cancel and the MSB of b-a will be equal to the "borrow" output, which tells you if a is strictly above b. (Like the CF flag for scalar sub. jb is jc.) So MSB(b-a) is our result.
Note that the SIMD andnot/and/or is a bit-blend, but we only care about the MSB. We broadcast it with srai -> shuffle_epi32, discarding the garbage in lower bits. (Or with SSE3, movshdup
as described in @Soont's answer.)
It differs from signed compare:
(MSB(a) ^ MSB(b)) ? ~a : b - a; // result in MSB
If signs are mixed then the sign of ~a
is also the sign of b
, of course.
Related Topics
Partial Specialization of Function Templates
How to Dynamically Allocate Arrays in C++
C++ Dynamic Array Initialization with Declaration
Boost.Python: Wrap Functions to Release the Gil
Why Do We Even Need the "Delete[]" Operator
Are Non Dereferenced Iterators Past the "One Past-The-End" Iterator of an Array Undefined Behavior
With C++, I Get Pointer with 0Xcdcdcdcd When Creating a Class - What Is Happening
Where Does the _1 Symbol Come from When Using Llvm's Libc++
How to Read from Memory Just Like from a File Using iOStream
C++ Class Member Function Callback
How to Benchmark Boost Spirit Parser
Get Private Data Members for Non Intrusive Boost Serialization C++
How to Read a File at Compile Time
How to Convert a String Literal to Unsigned Char Array in Visual C++