Sse2 Intrinsics - Comparing Unsigned Integers

SSE2 intrinsics - comparing unsigned integers

One way of implementing compares for unsigned 8 bit vectors is to exploit _mm_max_epu8, which returns the maximum of unsigned 8 bit int elements. You can compare for equality the (unsigned) maximum value of two elements with one of the source elements and then return the appropriate result. This translates to 2 instructions for >= or <=, and 3 instructions for > or <.

Example code:

#include <stdio.h>
#include <emmintrin.h> // SSE2

/* Unsigned 8-bit comparisons synthesised from SSE2 _mm_max_epu8:
 *   a >= b (unsigned)  <=>  max_epu8(a, b) == a
 * >= and <= cost 2 instructions; > and < cost 3 (one extra XOR with
 * all-ones to negate <=).  Implemented as static inline functions rather
 * than function-like macros so each argument is type-checked and
 * evaluated exactly once (the macro form expanded `a` twice). */
static inline __m128i _mm_cmpge_epu8(__m128i a, __m128i b)
{
    return _mm_cmpeq_epi8(_mm_max_epu8(a, b), a);
}

static inline __m128i _mm_cmple_epu8(__m128i a, __m128i b)
{
    return _mm_cmpge_epu8(b, a);
}

static inline __m128i _mm_cmpgt_epu8(__m128i a, __m128i b)
{
    /* NOT(a <= b): XOR against all-ones flips every lane. */
    return _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1));
}

static inline __m128i _mm_cmplt_epu8(__m128i a, __m128i b)
{
    return _mm_cmpgt_epu8(b, a);
}

/* Print the 16 lanes of a vector as unsigned bytes, 4 columns wide.
 * Replaces the "%vhhu" printf conversion, which is an Apple-gcc-only
 * extension: standard gcc/clang printf does not understand it, making
 * the original calls undefined behavior per C11 7.21.6.1. */
static void print_epu8(const char *name, __m128i v)
{
    unsigned char bytes[16];
    int i;

    _mm_storeu_si128((__m128i *)bytes, v);
    printf("%s = ", name);
    for (i = 0; i < 16; i++)
        printf("%4u", bytes[i]);
    printf("\n");
}

/* Exercise the unsigned byte comparisons across the interesting boundary
 * values (0, 1, 127, 128, 254, 255) and print each result vector
 * (255 = lane TRUE, 0 = lane FALSE). */
int main(void)
{
    __m128i va = _mm_setr_epi8(0, 0, 1, 1, 1, 127, 127, 127, 128, 128, 128, 254, 254, 254, 255, 255);
    __m128i vb = _mm_setr_epi8(0, 255, 0, 1, 255, 0, 127, 255, 0, 128, 255, 0, 254, 255, 0, 255);

    __m128i v_ge = _mm_cmpge_epu8(va, vb);
    __m128i v_le = _mm_cmple_epu8(va, vb);
    __m128i v_gt = _mm_cmpgt_epu8(va, vb);
    __m128i v_lt = _mm_cmplt_epu8(va, vb);

    print_epu8("va  ", va);
    print_epu8("vb  ", vb);
    print_epu8("v_ge", v_ge);
    print_epu8("v_le", v_le);
    print_epu8("v_gt", v_gt);
    print_epu8("v_lt", v_lt);

    return 0;
}

Compile and run:

$ gcc -Wall _mm_cmplt_epu8.c && ./a.out 
va = 0 0 1 1 1 127 127 127 128 128 128 254 254 254 255 255
vb = 0 255 0 1 255 0 127 255 0 128 255 0 254 255 0 255
v_ge = 255 0 255 255 0 255 255 0 255 255 0 255 255 0 255 255
v_le = 255 255 0 255 255 0 255 255 0 255 255 0 255 255 0 255
v_gt = 0 0 255 0 0 255 0 0 255 0 0 255 0 0 255 0
v_lt = 0 255 0 0 255 0 0 255 0 0 255 0 0 255 0 0

SSE2 intrinsics - find max of two unsigned short vectors

One (somewhat inefficient) way of doing it is to offset the input values by 0x8000 and then add this offset back to the result, e.g.:

#ifndef __SSE4_1__
/* SSE2 fallback for the SSE4.1 _mm_max_epu16: bias both inputs by 0x8000
 * so the unsigned range maps onto the signed range, take the signed max
 * (_mm_max_epi16), then remove the bias.
 * `static inline` gives the function internal linkage; a bare C99 `inline`
 * with no corresponding extern definition can produce link errors.
 * 0x8000 is cast to short explicitly: the value does not fit in a signed
 * 16-bit int, and the cast makes the intended wrap to -32768 visible
 * (and silences -Wconversion/-Woverflow). */
static inline __m128i _mm_max_epu16(const __m128i v0, const __m128i v1)
{
    const __m128i bias = _mm_set1_epi16((short)0x8000);
    return _mm_add_epi16(
        _mm_max_epi16(_mm_sub_epi16(v0, bias), _mm_sub_epi16(v1, bias)),
        bias);
}
#endif

With gcc or clang this generates one load instruction for the constant and four arithmetic instructions.


Note that you can use _mm_xor_si128 in place of _mm_add_epi16/_mm_sub_epi16, which is arguably clearer in intent, and may give better performance on your target architecture:

#ifndef __SSE4_1__
/* XOR variant of the SSE2 _mm_max_epu16 fallback: XOR with 0x8000 flips
 * only the sign bit, which is exactly the unsigned<->signed bias, and is
 * its own inverse -- so the same operation removes the bias afterwards.
 * `static inline` (rather than bare C99 `inline`) avoids link errors when
 * the compiler chooses not to inline.  0x8000 is cast to short explicitly
 * since the value does not fit in a signed 16-bit int. */
static inline __m128i _mm_max_epu16(const __m128i v0, const __m128i v1)
{
    const __m128i sign_flip = _mm_set1_epi16((short)0x8000);
    return _mm_xor_si128(
        _mm_max_epi16(_mm_xor_si128(v0, sign_flip),
                      _mm_xor_si128(v1, sign_flip)),
        sign_flip);
}
#endif

SSE Compare Packed Unsigned Bytes

It's not possible to do a greater-than compare on packed unsigned bytes directly, so I've unpacked the bytes into words (as they were positive, this acts both as a conversion from unsigned to signed and as an extension from byte to word) and compared them using PCMPGTW.

SSE2 intrinsics - comparing 2 __m128i's containing 4 int32's each to see how many are equal

You can AND the compare result with a vector of ones to create a vector of zeros and ones. Then use a horizontal add operation to count the ones. Here are some possibilities.

#include "stdio.h"
#include "stdint.h"
#include "string.h"
#include "intrin.h"

//----------------------------------------------------------------------------
// non-SSE method (reference for result check)
//----------------------------------------------------------------------------
// non-SSE method (reference for result check)
//
// Counts how many of the four 32-bit lanes are all-ones (the SIMD TRUE
// produced by _mm_cmpeq_epi32).  The lanes are copied out with memcpy
// instead of reading &value through a uint32_t pointer, which avoided the
// strict-aliasing hazard of the original `(void *) &value` cast.
static int method0 (__m128i value)
{
    uint32_t lanes [4];
    int index, total = 0;

    memcpy (lanes, &value, sizeof lanes);
    for (index = 0; index < 4; index++)
        total += lanes [index] == 0xFFFFFFFFu;
    return total;
}

//----------------------------------------------------------------------------
//
// horizontalAddBytes - return integer total of all 16 bytes in xmm argument
//
//----------------------------------------------------------------------------
//
// horizontalAddBytes - return integer total of all 16 bytes in xmm argument
//
// _mm_sad_epu8 against zero leaves two 16-bit partial sums (of the low and
// high 8 bytes) in the bottom of the two 64-bit lanes.  Shuffle 0xAA
// broadcasts dword 2 (the upper partial sum) so the add places the grand
// total in the low dword.  _mm_cvtsi128_si32 replaces the original
// _mm_cvtsi128_si64, which silently truncated a 64-bit result to int and
// does not exist on 32-bit targets; 32 bits are ample since the total of
// 16 bytes is at most 4080.
static int horizontalAddBytes (__m128i byteArray)
{
    const __m128i zero = _mm_setzero_si128 ();
    __m128i total = _mm_sad_epu8 (byteArray, zero);

    return _mm_cvtsi128_si32 (_mm_add_epi32 (total, _mm_shuffle_epi32 (total, 0xAA)));
}

//----------------------------------------------------------------------------
// requires SSE2
//----------------------------------------------------------------------------
// requires SSE2
// A logical shift right by 31 turns each all-ones (TRUE) lane into 1 and
// each zero lane into 0; summing the lanes as bytes then yields the count.
static int method1 (__m128i value)
{
    const __m128i lane_flags = _mm_srli_epi32 (value, 31);

    return horizontalAddBytes (lane_flags);
}

//----------------------------------------------------------------------------
// requires SSE3
static int method2 (__m128i value)
{
__m128 count;
const __m128 mask = _mm_set1_ps (1);
count = _mm_and_ps (_mm_castsi128_ps (value), mask);
count = _mm_hadd_ps (count, count);
count = _mm_hadd_ps (count, count);
return _mm_cvtss_si32 (count);
}

//----------------------------------------------------------------------------
// requires SSSE3
static int method3 (__m128i value)
{
__m128i count;
count = _mm_srli_epi32 (value, 31);
count = _mm_hadd_epi32 (count, count);
count = _mm_hadd_epi32 (count, count);
return _mm_cvtsi128_si32 (count);
}

//----------------------------------------------------------------------------

//----------------------------------------------------------------------------
// Expand the low four bits of `mask` into int32 slots 0, 4, 8 and 12 of
// `data` as 0 or 1 -- the stride of four ints matches the strided loads
// performed by main().  Only those four slots are written.
static void createTestData (uint32_t *data, int mask)
{
    int bit;

    for (bit = 0; bit < 4; bit++)
    {
        data [bit * 4] = (uint32_t)((mask >> bit) & 1);
    }
}

//----------------------------------------------------------------------------

// Exhaustively test methods 1-3 against the scalar reference (method0)
// over all 16x16 combinations of lane-equality patterns; prints a line
// only when a method disagrees with the reference.
int main (void)
{
    int maskA, maskB;
    uint32_t source [16];
    uint32_t reference [16];

    for (maskA = 0; maskA < 16; maskA++)
    {
        for (maskB = 0; maskB < 16; maskB++)
        {
            __m128i lhs, rhs, cmp;
            int expected, result1, result2, result3;

            createTestData (source, maskA);
            createTestData (reference, maskB);

            // Gather the four int32s of each pattern (strided 4 apart)
            lhs = _mm_set_epi32 (source [12], source [8], source [4], source [0]);
            rhs = _mm_set_epi32 (reference [12], reference [8], reference [4], reference [0]);

            // Lane-wise equality: matching lanes become all-ones
            cmp = _mm_cmpeq_epi32 (lhs, rhs);

            expected = method0 (cmp);
            result1 = method1 (cmp);
            result2 = method2 (cmp);
            result3 = method3 (cmp);
            if (result1 != expected) printf ("method1, index %d,%d expected %d, actual %d\n", maskA, maskB, expected, result1);
            if (result2 != expected) printf ("method2, index %d,%d expected %d, actual %d\n", maskA, maskB, expected, result2);
            if (result3 != expected) printf ("method3, index %d,%d expected %d, actual %d\n", maskA, maskB, expected, result3);
        }
    }

    return 0;
}

//----------------------------------------------------------------------------

SSE intrinsics: Convert 32-bit floats to UNSIGNED 8-bit integers

There is no direct conversion from float to byte, _mm_cvtps_pi8 is a composite. _mm_cvtps_pi16 is also a composite, and in this case it's just doing some pointless stuff that you undo with the shuffle. They also return annoying __m64's.

Anyway, we can convert to dwords (signed, but that doesn't matter), and then pack (unsigned) or shuffle them into bytes. _mm_shuffle_(e)pi8 generates a pshufb, Core2 45nm and AMD processors aren't too fond of it and you have to get a mask from somewhere.

Either way you don't have to round to the nearest integer first, the convert will do that. At least, if you haven't messed with the rounding mode.

Using packs 1: (not tested) -- probably not useful, packusdw already outputs unsigned words but then packuswb wants signed words again. Kept around because it is referred to elsewhere.

cvtps2dq xmm0, xmm0  
packusdw xmm0, xmm0 ; unsafe: saturates to a different range than packuswb accepts
packuswb xmm0, xmm0
movd somewhere, xmm0

Using different shuffles:

cvtps2dq xmm0, xmm0  
packssdw xmm0, xmm0 ; correct: signed saturation on first step to feed packuswb
packuswb xmm0, xmm0
movd somewhere, xmm0

Using shuffle: (not tested)

cvtps2dq xmm0, xmm0
pshufb xmm0, [shufmask]
movd somewhere, xmm0

shufmask: db 0, 4, 8, 12, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h, 80h

SSE2 integer overflow checking

Here is a somewhat more efficient version of @hirschhornsalz's sum_and_overflow function:

// Lane-wise signed 32-bit addition with per-lane overflow detection (SSE2).
// C++ snippet: out-parameters are references and the lanes use GCC's
// __v4si vector type.  The overflow test is the Hacker's Delight (p. 27)
// identity  overflow = (sum ^ a) & (sum ^ b) , which leaves the overflow
// flag in each lane's sign bit; the final arithmetic shift broadcasts that
// bit so each lane becomes the conventional SIMD boolean -1 (TRUE) or 0.
// NOTE(review): passing __v4si where the intrinsics expect __m128i relies
// on GCC's vector-conversion rules; some C++ configurations require
// -flax-vector-conversions -- confirm against the target compiler.
void sum_and_overflow(__v4si a, __v4si b, __v4si& sum, __v4si& overflow)
{
__v4si sa, sb;

sum = _mm_add_epi32(a, b); // calculate sum
sa = _mm_xor_si128(sum, a); // compare sign of sum with sign of a
sb = _mm_xor_si128(sum, b); // compare sign of sum with sign of b
overflow = _mm_and_si128(sa, sb); // get overflow in sign bit
overflow = _mm_srai_epi32(overflow, 31); // convert to SIMD boolean (-1 == TRUE, 0 == FALSE)
}

It uses an expression for overflow detection from Hacker's Delight page 27:

sum = a + b;
overflow = (sum ^ a) & (sum ^ b); // overflow flag in sign bit

Note that the overflow vector will contain the more conventional SIMD boolean values of -1 for TRUE (overflow) and 0 for FALSE (no overflow). If you only need the overflow in the sign bit and the other bits are "don't care" then you can omit the last line of the function, reducing the number of SIMD instructions from 5 to 4.

NB: this solution, as well as the previous solution on which it is based are for signed integer values. A solution for unsigned values will require a slightly different approach (see @Stephen Canon's answer).

What is the most efficient way to do unsigned 64 bit comparison on SSE2?

Translated from Hacker's Delight:

/* Unsigned 64-bit a > b per lane (SSE2), after Hacker's Delight:
 * if the lanes' MSBs differ, a's MSB is the answer; otherwise the MSB
 * (borrow) of b - a is.  The andnot/or sequence bit-blends those two
 * candidates; only the sign bit of each 64-bit lane is meaningful, so it
 * is broadcast with an arithmetic shift plus dword shuffle. */
static
__m128i sse2_cmpgt_epu64(__m128i a, __m128i b) {
    __m128i msb_mixed = _mm_xor_si128(b, a);        /* sign bit set where MSBs differ */
    __m128i borrow    = _mm_sub_epi64(b, a);        /* sign bit = borrow of b - a */
    __m128i pred      = _mm_or_si128(_mm_andnot_si128(msb_mixed, borrow),
                                     _mm_andnot_si128(b, a));
    /* Smear each lane's sign bit across all 64 bits of the lane. */
    return _mm_shuffle_epi32(_mm_srai_epi32(pred, 31), _MM_SHUFFLE(3,3,1,1));
}

Concept: If mixed "signs" (unsigned MSBs) then return a else return b - a

(MSB(a) ^ MSB(b)) ? a : b - a; // result in MSB

This makes sense:

  • if a's MSB is set and b's isn't, a is unsigned above (so MSB(a) is our result)
  • if b's MSB is set and a's isn't, a is unsigned below (so MSB(a) is our result)
  • if their MSBs are the same, the values are in the same half of the unsigned range so b-a is effectively a 63-bit subtraction. The MSBs will cancel and the MSB of b-a will be equal to the "borrow" output which tells you if a is strictly above b. (Like the CF flag for scalar sub. jb is jc). So MSB(b-a) is our result.

Note that the SIMD andnot/and/or is a bit-blend, but we only care about the MSB. We broadcast it with srai -> shuffle_epi32, discarding the garbage in lower bits. (Or with SSE3, movshdup as described in @Soont's answer.)


It differs from signed compare:

(MSB(a) ^ MSB(b)) ? ~a : b - a; // result in MSB

If signs are mixed then the sign of ~a is also the sign of b, of course.



Related Topics



Leave a reply



Submit