How to Add and Subtract 128 Bit Integers in C or C++ If My Compiler Does Not Support Them

How can I add and subtract 128 bit integers in C or C++ if my compiler does not support them?

If all you need is addition and subtraction, and you already have your 128-bit values in binary form, a library might be handy but isn't strictly necessary. This math is trivial to do yourself.

I don't know what your compiler uses for 64-bit types, so I'll use INT64 and UINT64 for signed and unsigned 64-bit integer quantities.

class Int128
{
public:
    ...
    Int128 operator+(const Int128 & rhs)
    {
        Int128 sum;
        sum.high = high + rhs.high;
        sum.low = low + rhs.low;
        // check for overflow of low 64 bits, add carry to high
        if (sum.low < low)
            ++sum.high;
        return sum;
    }
    Int128 operator-(const Int128 & rhs)
    {
        Int128 difference;
        difference.high = high - rhs.high;
        difference.low = low - rhs.low;
        // check for underflow of low 64 bits, subtract carry to high
        if (difference.low > low)
            --difference.high;
        return difference;
    }

private:
    INT64  high;
    UINT64 low;
};

How to properly add/subtract a 128-bit number (as two uint64_t)?

In grade 1 or 2, you should have learn't how to break down the addition of 1 and 10 into parts, by splitting it into multiple separate additions of tens and units. When dealing with big numbers, the same principals can be applied to compute arithmetic operations on arbitrarily large numbers, by realizing your units are now units of 2^bits, your "tens" are 2^bits larger and so on.

An efficient way to do basic 128 bit integer calculations in C++?

Update: Since the OP hasn't accepted an answer yet <hint><hint>, I've attached a bit more code.

Using the libraries discussed above is probably a good idea. While you might only need a few functions today, eventually you may find that you need one more. Then one more after that. Until eventually you end up writing, debugging and maintaining your own 128bit math library. Which is a waste of your time and effort.

That said. If you are determined to roll your own:

1) The cuda question you asked previously already has c code for multiplication. Was there some problem with it?

2) The shift probably won't benefit from using asm, so a c solution makes sense to me here as well. ~~Although if performance is really an issue here, I'd see if the Edison supports SHLD/SHRD, which might make this a bit faster. Otherwise, m~~ Maybe an approach like this?

my_uint128_t lshift_uint128 (const my_uint128_t a, int b)
{
   my_uint128_t res;
   if (b < 32) {    
      res.x = a.x << b;
      res.y = (a.y << b) | (a.x >> (32 - b));
      res.z = (a.z << b) | (a.y >> (32 - b));
      res.w = (a.w << b) | (a.z >> (32 - b));
   } elseif (b < 64) {
      ...
   }

   return res;
}

Update: Since it appears that the Edison may support SHLD/SHRD, here's an alternative which might be more performant than the 'c' code above. As with all code purporting to be faster, you should test it.

inline
unsigned int __shld(unsigned int into, unsigned int from, unsigned int c)
{
   unsigned int res;

   if (__builtin_constant_p(into) &&
       __builtin_constant_p(from) &&
       __builtin_constant_p(c))
   {
      res = (into << c) | (from >> (32 - c));
   }
   else
   {
      asm("shld %b3, %2, %0"
          : "=rm" (res)
          : "0" (into), "r" (from), "ic" (c)
          : "cc");
   }

   return res;
}

inline
unsigned int __shrd(unsigned int into, unsigned int from, unsigned int c)
{
   unsigned int res;

   if (__builtin_constant_p(into) && 
       __builtin_constant_p(from) && 
       __builtin_constant_p(c))
   {
      res = (into >> c) | (from << (32 - c));
   }
   else
   {
      asm("shrd %b3, %2, %0"
          : "=rm" (res)
          : "0" (into), "r" (from), "ic" (c)
          : "cc");
   }

   return res;
}

my_uint128_t lshift_uint128 (const my_uint128_t a, unsigned int b)
{
   my_uint128_t res;

   if (b < 32) {
      res.x = a.x << b;
      res.y = __shld(a.y, a.x, b);
      res.z = __shld(a.z, a.y, b);
      res.w = __shld(a.w, a.z, b);
   } else if (b < 64) {
      res.x = 0;
      res.y = a.x << (b - 32);
      res.z = __shld(a.y, a.x, b - 32);
      res.w = __shld(a.z, a.y, b - 32);
   } else if (b < 96) {
      res.x = 0;
      res.y = 0;
      res.z = a.x << (b - 64);
      res.w = __shld(a.y, a.x, b - 64);
   } else if (b < 128) {
      res.x = 0;
      res.y = 0;
      res.z = 0;
      res.w = a.x << (b - 96);
   } else {
      memset(&res, 0, sizeof(res));
   }

   return res;
}

my_uint128_t rshift_uint128 (const my_uint128_t a, unsigned int b)
{
   my_uint128_t res;

   if (b < 32) {
      res.x = __shrd(a.x, a.y, b);
      res.y = __shrd(a.y, a.z, b);
      res.z = __shrd(a.z, a.w, b);
      res.w = a.w >> b;
   } else if (b < 64) {
      res.x = __shrd(a.y, a.z, b - 32);
      res.y = __shrd(a.z, a.w, b - 32);
      res.z = a.w >> (b - 32);
      res.w = 0;
   } else if (b < 96) {
      res.x = __shrd(a.z, a.w, b - 64);
      res.y = a.w >> (b - 64);
      res.z = 0;
      res.w = 0;
   } else if (b < 128) {
      res.x = a.w >> (b - 96);
      res.y = 0;
      res.z = 0;
      res.w = 0;
   } else {
      memset(&res, 0, sizeof(res));
   }

   return res;
}

3) The addition may benefit from asm. You could try this:

struct my_uint128_t
{
   unsigned int x;
   unsigned int y;
   unsigned int z;
   unsigned int w;
};

my_uint128_t add_uint128 (const my_uint128_t a, const my_uint128_t b)
{
   my_uint128_t res;

    asm ("addl %5, %[resx]\n\t"
         "adcl %7, %[resy]\n\t"
         "adcl %9, %[resz]\n\t"
         "adcl %11, %[resw]\n\t"
         : [resx] "=&r" (res.x), [resy] "=&r" (res.y), 
           [resz] "=&r" (res.z), [resw] "=&r" (res.w)
         : "%0"(a.x), "irm"(b.x), 
           "%1"(a.y), "irm"(b.y), 
           "%2"(a.z), "irm"(b.z), 
           "%3"(a.w), "irm"(b.w)
         : "cc");

   return res;
}

I just dashed this off, so use at your own risk. I don't have an Edison, but this works with x86.

Update: If you are just doing accumulation (think to += from instead of the code above which is c = a + b), this code might serve you better:

inline
void addto_uint128 (my_uint128_t *to, const my_uint128_t from)
{
   asm ("addl %[fromx], %[tox]\n\t"
        "adcl %[fromy], %[toy]\n\t"
        "adcl %[fromz], %[toz]\n\t"
        "adcl %[fromw], %[tow]\n\t"
        : [tox] "+&r"(to->x), [toy] "+&r"(to->y), 
          [toz] "+&r"(to->z), [tow] "+&r"(to->w)
        : [fromx] "irm"(from.x), [fromy] "irm"(from.y), 
          [fromz] "irm"(from.z), [fromw] "irm"(from.w)
        : "cc");
}

Is there a library or other way to do 128-bit math operations?

Check out the GNU Multiple Precision Arithmetic Library.

Is there a 128 bit integer in C++?

GCC and Clang support __int128

What gcc versions support the __int128 intrinsic type?

Not sure about the first version, but you can test for the __SIZEOF_INT128__ macro - which is (typically) 16 if defined.

What type should I use for a 128-bit number in in .NET?

It's here in System.Numerics. "The BigInteger type is an immutable type that represents an arbitrarily large integer whose value in theory has no upper or lower bounds."

var i = System.Numerics.BigInteger.Parse("10000000000000000000000000000000");

How to combine 4 uint32_t ints into a whole 128 bit int and return

I'd use union:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void) {
    union {
        struct {
            uint32_t v1;
            uint32_t v2;
            uint32_t v3;
            uint32_t v4;
        } __attribute__((packed));
        unsigned __int128 i128;
    } t128;
    t128.v1 = 0x22221111;
    t128.v2 = 0x44443333;
    t128.v3 = 0x66665555;
    t128.v4 = 0x88887777;

    printf("0x%016"PRIx64"%016"PRIx64"\n", (uint64_t)(t128.i128 >> 64), (uint64_t)t128.i128);

    return 0;
}

This gives:

0x88887777666655554444333322221111

as a result on intel (little-endian) architecture.

Can double be used to store and safely retrieve 128 bit IPv6?

Technically you could, if sizeof(double) * CHAR_BITS ≥ 128 (it's 64bits on my machine), but why would you do that? Instead of reinventing the wheel use sockaddr_in6, or, if you must, an array of uint8_t or a std::bitset.

Related answer: Efficient way to store IPv4/IPv6 addresses