Efficient Multiply/Divide of Two 128-Bit Integers on X86 (No 64-Bit)

Signed 64-Bit multiply and 128-Bit Divide on x86 in assembly

First, the MUL64 function does not work 100%

If you try to do 0xFFFFFFFFFFFFFFFF x 0xFFFFFFFFFFFFFFFF, the Hi 64-bit result is 0xFFFFFFFeFFFFFFFF, it should be 0xFFFFFFFFFFFFFFFe

To fix this, the carry flag after the POPFD instruction should be added to EDX, the highest 32-bit part of the result. Now following Peter Cordes advice, remove the push and pops of EAX/ECX/EDX. Finally use setc BL and movzx EBX,BL to save the flag. Note: you cannot easily use xor EBX,EBX to zero it because xor effects the flags. We use movzx because its faster than add BL,0xFF and add is faster than adc based on Skylake specs.

The Result:

MUL64 PROC, A:QWORD, B:QWORD, pu128:DWORD
push EBX
push EDI
mov EDI,pu128
; LO(A) * LO(B)
mov EAX,DWORD PTR A
mov EDX,DWORD PTR B
mul EDX
mov [EDI],EAX ; Save the partial product.
mov ECX,EDX
; LO(A) * HI(B)
mov EAX,DWORD PTR A
mov EDX,DWORD PTR B+4
mul EDX
add EAX,ECX
adc EDX,0
mov EBX,EAX
mov ECX,EDX
; HI(A) * LO(B)
mov EAX,DWORD PTR A+4
mov EDX,DWORD PTR B
mul EDX
add EAX,EBX
adc ECX,EDX
setc BL ; Save carry.
movzx EBX,BL ; Zero-Extend carry.
mov [EDI+4],EAX ; Save the partial product.
; HI(A) * HI(B)
mov EAX,DWORD PTR A+4
mov EDX,DWORD PTR B+4
mul EDX
add EDX,EBX ; Add carry from above.
add EAX,ECX
adc EDX,0
mov [EDI+8],EAX ; Save the partial product.
mov [EDI+12],EDX ; Save the partial product.
pop EDI
pop EBX
ret 20
MUL64 ENDP

Now, to make a signed version of the function use this formula:

my128.Hi -= (((A < 0) ? B : 0) + ((B < 0) ? A : 0));

The Result:

IMUL64 PROC, A:SQWORD, B:SQWORD, pi128:DWORD
push EBX
push EDI
mov EDI,pi128
; LO(A) * LO(B)
mov EAX,DWORD PTR A
mov EDX,DWORD PTR B
mul EDX
mov [EDI],EAX ; Save the partial product.
mov ECX,EDX
; LO(A) * HI(B)
mov EAX,DWORD PTR A
mov EDX,DWORD PTR B+4
mul EDX
add EAX,ECX
adc EDX,0
mov EBX,EAX
mov ECX,EDX
; HI(A) * LO(B)
mov EAX,DWORD PTR A+4
mov EDX,DWORD PTR B
mul EDX
add EAX,EBX
adc ECX,EDX
setc BL ; Save carry.
movzx EBX,BL ; Zero-Extend carry.
mov [EDI+4],EAX ; Save the partial product.
; HI(A) * HI(B)
mov EAX,DWORD PTR A+4
mov EDX,DWORD PTR B+4
mul EDX
add EDX,EBX ; Add carry from above.
add EAX,ECX
adc EDX,0
mov [EDI+8],EAX ; Save the partial product.
mov [EDI+12],EDX ; Save the partial product.
; Signed version only:
cmp DWORD PTR A+4,0
jg zero_b
jl use_b
cmp DWORD PTR A,0
jae zero_b
use_b:
mov ECX,DWORD PTR B
mov EBX,DWORD PTR B+4
jmp test_b
zero_b:
xor ECX,ECX
mov EBX,ECX
test_b:
cmp DWORD PTR B+4,0
jg zero_a
jl use_a
cmp DWORD PTR B,0
jae zero_a
use_a:
mov EAX,DWORD PTR A
mov EDX,DWORD PTR A+4
jmp do_last_op
zero_a:
xor EAX,EAX
mov EDX,EAX
do_last_op:
add EAX,ECX
adc EDX,EBX
sub [EDI+8],EAX
sbb [EDI+12],EDX
; End of signed version!
pop EDI
pop EBX
ret 20
IMUL64 ENDP

The DIV128 function should be fine (also probably the fastest) for getting a 128-bit quotient from a 32-bit divisor, but if you need to use a 128-bit divisor then look at this code https://www.codeproject.com/Tips/785014/UInt-Division-Modulus that has an example of using the Binary Shift Algorithm for 128-bit division. It could probably be 3x faster if written in assembly.

To make a signed version of DIV128, first determine if the sign of the divisor and dividend are the same or different. If they are the same, then the result should be positive. If they are different, then the result should be negative. So... Make the dividend and divisor positive if they are negative and call DIV128, after that, negate the results if the signs were different.

Here is some example code written in C++

VOID IDIV128(PSDQWORD Dividend, PSDQWORD Divisor, PSDQWORD Quotient, PSDQWORD Remainder)
{
    BOOL Negate;
    DQWORD DD, DV;

    Negate = TRUE;

    // Use local DD and DV so Dividend and Divisor dont get currupted.
    DD.Lo = Dividend->Lo;
    DD.Hi = Dividend->Hi;
    DV.Lo = Divisor->Lo;
    DV.Hi = Divisor->Hi;

    // if the signs are the same then: Negate = FALSE;
    if ((DD.Hi & 0x8000000000000000) == (DV.Hi & 0x8000000000000000)) Negate = FALSE;

    // Covert Dividend and Divisor to possitive if negative: (negate)
    if (DD.Hi & 0x8000000000000000) NEG128((PSDQWORD)&DD);
    if (DV.Hi & 0x8000000000000000) NEG128((PSDQWORD)&DV);

    DIV128(&DD, &DV, (PDQWORD)Quotient, (PDQWORD)Remainder);

    if (Negate == TRUE)
    {
        NEG128(Quotient);
        NEG128(Remainder);
    }
}

EDIT:

Following Peter Cordes advice, we can optimize MUL64/IMUL64 even more. Look at the comments for specific changes being made. I have also replaced MUL64 PROC, A:QWORD, B:QWORD, pu128:DWORD with MUL64@20: and IMUL64@20: to eliminate unnecessary use of EBP that masm adds. I also optimized the sign-fixing work for IMUL64.

The current .asm file for MUL64/IMUL64

.MODEL flat, stdcall

EXTERNDEF  MUL64@20     :PROC
EXTERNDEF  IMUL64@20    :PROC

.CODE

MUL64@20:
push EBX
push EDI

;            -----------------
;            |     pu128     |
;            |---------------|
;            |       B       |
;            |---------------|
;            |       A       |
;            |---------------|
;            |  ret address  |
;            |---------------|
;            |      EBX      |
;            |---------------|
;    ESP---->|      EDI      |
;            -----------------

A       TEXTEQU   <[ESP+12]>
B       TEXTEQU   <[ESP+20]>
pu128   TEXTEQU   <[ESP+28]>

mov EDI,pu128
; LO(A) * LO(B)
mov EAX,DWORD PTR A
mul DWORD PTR B
mov [EDI],EAX ; Save the partial product.
mov ECX,EDX
; LO(A) * HI(B)
mov EAX,DWORD PTR A
mul DWORD PTR B+4
add EAX,ECX
adc EDX,0
mov EBX,EAX
mov ECX,EDX
; HI(A) * LO(B)
mov EAX,DWORD PTR A+4
mul DWORD PTR B
add EAX,EBX
adc ECX,EDX
setc BL ; Save carry.
mov [EDI+4],EAX ; Save the partial product.
; HI(A) * HI(B)
mov EAX,DWORD PTR A+4
mul DWORD PTR B+4
add EAX,ECX
movzx ECX,BL ; Zero-Extend saved carry from above.
adc EDX,ECX
mov [EDI+8],EAX ; Save the partial product.
mov [EDI+12],EDX ; Save the partial product.
pop EDI
pop EBX
ret 20

IMUL64@20:
push EBX
push EDI

;            -----------------
;            |     pi128     |
;            |---------------|
;            |       B       |
;            |---------------|
;            |       A       |
;            |---------------|
;            |  ret address  |
;            |---------------|
;            |      EBX      |
;            |---------------|
;    ESP---->|      EDI      |
;            -----------------

A       TEXTEQU   <[ESP+12]>
B       TEXTEQU   <[ESP+20]>
pi128   TEXTEQU   <[ESP+28]>

mov EDI,pi128
; LO(A) * LO(B)
mov EAX,DWORD PTR A
mul DWORD PTR B
mov [EDI],EAX ; Save the partial product.
mov ECX,EDX
; LO(A) * HI(B)
mov EAX,DWORD PTR A
mul DWORD PTR B+4
add EAX,ECX
adc EDX,0
mov EBX,EAX
mov ECX,EDX
; HI(A) * LO(B)
mov EAX,DWORD PTR A+4
mul DWORD PTR B
add EAX,EBX
adc ECX,EDX
setc BL ; Save carry.
mov [EDI+4],EAX ; Save the partial product.
; HI(A) * HI(B)
mov EAX,DWORD PTR A+4
mul DWORD PTR B+4
add EAX,ECX
movzx ECX,BL ; Zero-Extend saved carry from above.
adc EDX,ECX
mov [EDI+8],EAX ; Save the partial product.
mov [EDI+12],EDX ; Save the partial product.
; Signed version only:
mov BL,BYTE PTR B+7
and BL,80H
jz zero_a
mov EAX,DWORD PTR A
mov EDX,DWORD PTR A+4
jmp test_a
zero_a:
xor EAX,EAX
mov EDX,EAX
test_a:
mov BL,BYTE PTR A+7
and BL,80H
jz do_last_op
add EAX,DWORD PTR B
adc EDX,DWORD PTR B+4
do_last_op:
sub [EDI+8],EAX
sbb [EDI+12],EDX
; End of signed version!
pop EDI
pop EBX
ret 20

END

How can I multiply 64 bit operands and get 128 bit result portably?

As I understand the question, you want a portable pure C implementation of 64 bit multiplication, with output to a 128 bit value, stored in two 64 bit values. In which case this article purports to have what you need. That code is written for C++. It doesn't take much to turn it into C code:

void mult64to128(uint64_t op1, uint64_t op2, uint64_t *hi, uint64_t *lo)
{
    uint64_t u1 = (op1 & 0xffffffff);
    uint64_t v1 = (op2 & 0xffffffff);
    uint64_t t = (u1 * v1);
    uint64_t w3 = (t & 0xffffffff);
    uint64_t k = (t >> 32);

    op1 >>= 32;
    t = (op1 * v1) + k;
    k = (t & 0xffffffff);
    uint64_t w1 = (t >> 32);

    op2 >>= 32;
    t = (u1 * op2) + k;
    k = (t >> 32);

    *hi = (op1 * op2) + w1 + k;
    *lo = (t << 32) + w3;
}

An efficient way to do basic 128 bit integer calculations in C++?

Update: Since the OP hasn't accepted an answer yet <hint><hint>, I've attached a bit more code.

Using the libraries discussed above is probably a good idea. While you might only need a few functions today, eventually you may find that you need one more. Then one more after that. Until eventually you end up writing, debugging and maintaining your own 128bit math library. Which is a waste of your time and effort.

That said. If you are determined to roll your own:

1) The cuda question you asked previously already has c code for multiplication. Was there some problem with it?

2) The shift probably won't benefit from using asm, so a c solution makes sense to me here as well. ~~Although if performance is really an issue here, I'd see if the Edison supports SHLD/SHRD, which might make this a bit faster. Otherwise, m~~ Maybe an approach like this?

my_uint128_t lshift_uint128 (const my_uint128_t a, int b)
{
   my_uint128_t res;
   if (b < 32) {    
      res.x = a.x << b;
      res.y = (a.y << b) | (a.x >> (32 - b));
      res.z = (a.z << b) | (a.y >> (32 - b));
      res.w = (a.w << b) | (a.z >> (32 - b));
   } elseif (b < 64) {
      ...
   }

   return res;
}

Update: Since it appears that the Edison may support SHLD/SHRD, here's an alternative which might be more performant than the 'c' code above. As with all code purporting to be faster, you should test it.

inline
unsigned int __shld(unsigned int into, unsigned int from, unsigned int c)
{
   unsigned int res;

   if (__builtin_constant_p(into) &&
       __builtin_constant_p(from) &&
       __builtin_constant_p(c))
   {
      res = (into << c) | (from >> (32 - c));
   }
   else
   {
      asm("shld %b3, %2, %0"
          : "=rm" (res)
          : "0" (into), "r" (from), "ic" (c)
          : "cc");
   }

   return res;
}

inline
unsigned int __shrd(unsigned int into, unsigned int from, unsigned int c)
{
   unsigned int res;

   if (__builtin_constant_p(into) && 
       __builtin_constant_p(from) && 
       __builtin_constant_p(c))
   {
      res = (into >> c) | (from << (32 - c));
   }
   else
   {
      asm("shrd %b3, %2, %0"
          : "=rm" (res)
          : "0" (into), "r" (from), "ic" (c)
          : "cc");
   }

   return res;
}

my_uint128_t lshift_uint128 (const my_uint128_t a, unsigned int b)
{
   my_uint128_t res;

   if (b < 32) {
      res.x = a.x << b;
      res.y = __shld(a.y, a.x, b);
      res.z = __shld(a.z, a.y, b);
      res.w = __shld(a.w, a.z, b);
   } else if (b < 64) {
      res.x = 0;
      res.y = a.x << (b - 32);
      res.z = __shld(a.y, a.x, b - 32);
      res.w = __shld(a.z, a.y, b - 32);
   } else if (b < 96) {
      res.x = 0;
      res.y = 0;
      res.z = a.x << (b - 64);
      res.w = __shld(a.y, a.x, b - 64);
   } else if (b < 128) {
      res.x = 0;
      res.y = 0;
      res.z = 0;
      res.w = a.x << (b - 96);
   } else {
      memset(&res, 0, sizeof(res));
   }

   return res;
}

my_uint128_t rshift_uint128 (const my_uint128_t a, unsigned int b)
{
   my_uint128_t res;

   if (b < 32) {
      res.x = __shrd(a.x, a.y, b);
      res.y = __shrd(a.y, a.z, b);
      res.z = __shrd(a.z, a.w, b);
      res.w = a.w >> b;
   } else if (b < 64) {
      res.x = __shrd(a.y, a.z, b - 32);
      res.y = __shrd(a.z, a.w, b - 32);
      res.z = a.w >> (b - 32);
      res.w = 0;
   } else if (b < 96) {
      res.x = __shrd(a.z, a.w, b - 64);
      res.y = a.w >> (b - 64);
      res.z = 0;
      res.w = 0;
   } else if (b < 128) {
      res.x = a.w >> (b - 96);
      res.y = 0;
      res.z = 0;
      res.w = 0;
   } else {
      memset(&res, 0, sizeof(res));
   }

   return res;
}

3) The addition may benefit from asm. You could try this:

struct my_uint128_t
{
   unsigned int x;
   unsigned int y;
   unsigned int z;
   unsigned int w;
};

my_uint128_t add_uint128 (const my_uint128_t a, const my_uint128_t b)
{
   my_uint128_t res;

    asm ("addl %5, %[resx]\n\t"
         "adcl %7, %[resy]\n\t"
         "adcl %9, %[resz]\n\t"
         "adcl %11, %[resw]\n\t"
         : [resx] "=&r" (res.x), [resy] "=&r" (res.y), 
           [resz] "=&r" (res.z), [resw] "=&r" (res.w)
         : "%0"(a.x), "irm"(b.x), 
           "%1"(a.y), "irm"(b.y), 
           "%2"(a.z), "irm"(b.z), 
           "%3"(a.w), "irm"(b.w)
         : "cc");

   return res;
}

I just dashed this off, so use at your own risk. I don't have an Edison, but this works with x86.

Update: If you are just doing accumulation (think to += from instead of the code above which is c = a + b), this code might serve you better:

inline
void addto_uint128 (my_uint128_t *to, const my_uint128_t from)
{
   asm ("addl %[fromx], %[tox]\n\t"
        "adcl %[fromy], %[toy]\n\t"
        "adcl %[fromz], %[toz]\n\t"
        "adcl %[fromw], %[tow]\n\t"
        : [tox] "+&r"(to->x), [toy] "+&r"(to->y), 
          [toz] "+&r"(to->z), [tow] "+&r"(to->w)
        : [fromx] "irm"(from.x), [fromy] "irm"(from.y), 
          [fromz] "irm"(from.z), [fromw] "irm"(from.w)
        : "cc");
}

Divide 64-bit integers as though the dividend is shifted left 64 bits, without having 128-bit types

This can be done without a multi-word division

Suppose we want to do ⌊2⁶⁴ × ^x⁄_y⌋ then we can transform the expression like this

$Unicode math: ⌊(2^64 x)/y⌋=⌊(⌊2^64/y⌋+{2^64/y})x⌋=⌊2^64/y⌋x+⌊{2^64/y}x┤$

The first term is trivially done as ((-y)/y + 1)*x as per this question How to compute 2⁶⁴/n in C?

The second term is equivalent to (2⁶⁴ % y)/y*x and is a little bit trickier. I've tried various ways but all need 128-bit multiplication and 128/64 division if using only integer operations. That can be done using the algorithms to calculate MulDiv64(a, b, c) = a*b/c in the below questions

Most accurate way to do a combined multiply-and-divide operation in 64-bit?
How to multiply a 64 bit integer by a fraction in C++ while minimizing error?
(a * b) / c MulDiv and dealing with overflow from intermediate multiplication
How can I multiply and divide 64-bit ints accurately?

However they may be slow, and if you have those functions you calculate the whole expression more easily like MulDiv64(x, UINT64_MAX, y) + x/y + something without messing up with the above transformation

Using long double seems to be the easiest way if it has 64 bits of precision or more. So now it can be done by (2⁶⁴ % y)/(long double)y*x

uint64_t divHi64(uint64_t x, uint64_t y) {
    uint64_t mod_y = UINT64_MAX % y + 1;
    uint64_t result = ((-y)/y + 1)*x;
    if (mod_y != y)
        result += (uint64_t)((mod_y/(long double)y)*x);
    return result;
}

The overflow check was omitted for simplification. A slight modification will be needed if you need signed division

If you're targeting 64-bit Windows but you're using MSVC which doesn't have __int128 then now it has a 128-bit/64-bit divide intrinsic which simplifies the job significantly without a 128-bit integer type. You still need to handle overflow though because the div instruction will throw an exception on that case

uint64_t divHi64(uint64_t x, uint64_t y) {
    uint64_t high, remainder;
    uint64_t low = _umul128(UINT64_MAX, y, &high);
    if (x <= high /* && 0 <= low */)
        return _udiv128(x, 0, y, &remainder);
    // overflow case
    errno = EOVERFLOW;
    return 0;
}

The overflow checking above is can be simplified to checking whether x < y, because if x >= y then the result will overflow

Getting the high part of 64 bit integer multiplication

If you're using gcc and the version you have supports 128 bit numbers (try using __uint128_t) then performing the 128 multiply and extracting the upper 64 bits is likely to be the most efficient way of getting the result.

If your compiler doesn't support 128 bit numbers, then Yakk's answer is correct. However, it may be too brief for general consumption. In particular, an actual implementation has to be careful of overflowing 64 bit integers.

The simple and portable solution he proposes is to break each of a and b into 2 32-bit numbers and then multiply those 32 bit numbers using the 64 bit multiply operation. If we write:

uint64_t a_lo = (uint32_t)a;
uint64_t a_hi = a >> 32;
uint64_t b_lo = (uint32_t)b;
uint64_t b_hi = b >> 32;

then it is obvious that:

a = (a_hi << 32) + a_lo;
b = (b_hi << 32) + b_lo;

and:

a * b = ((a_hi << 32) + a_lo) * ((b_hi << 32) + b_lo)
      = ((a_hi * b_hi) << 64) +
        ((a_hi * b_lo) << 32) +
        ((b_hi * a_lo) << 32) +
          a_lo * b_lo

provided the calculation is performed using 128 bit (or greater) arithmetic.

But this problem requires that we perform all the calculcations using 64 bit arithmetic, so we have to worry about overflow.

Since a_hi, a_lo, b_hi, and b_lo are all unsigned 32 bit numbers, their product will fit in an unsigned 64 bit number without overflow. However, the intermediate results of the above calculation will not.

The following code will implement mulhi(a, b) when the mathemetics must be performed modulo 2^64:

uint64_t    a_lo = (uint32_t)a;
uint64_t    a_hi = a >> 32;
uint64_t    b_lo = (uint32_t)b;
uint64_t    b_hi = b >> 32;

uint64_t    a_x_b_hi =  a_hi * b_hi;
uint64_t    a_x_b_mid = a_hi * b_lo;
uint64_t    b_x_a_mid = b_hi * a_lo;
uint64_t    a_x_b_lo =  a_lo * b_lo;

uint64_t    carry_bit = ((uint64_t)(uint32_t)a_x_b_mid +
                         (uint64_t)(uint32_t)b_x_a_mid +
                         (a_x_b_lo >> 32) ) >> 32;

uint64_t    multhi = a_x_b_hi +
                     (a_x_b_mid >> 32) + (b_x_a_mid >> 32) +
                     carry_bit;

return multhi;

As Yakk points out, if you don't mind being off by +1 in the upper 64 bits, you can omit the calculation of the carry bit.

Fastest way to calculate a 128-bit integer modulo a 64-bit integer

You can use the division version of Russian Peasant Multiplication.

To find the remainder, execute (in pseudo-code):

X = B;

while (X <= A/2)
{
    X <<= 1;
}

while (A >= B)
{
    if (A >= X)
        A -= X;
    X >>= 1;
}

The modulus is left in A.

You'll need to implement the shifts, comparisons and subtractions to operate on values made up of a pair of 64 bit numbers, but that's fairly trivial (likely you should implement the left-shift-by-1 as X + X).

This will loop at most 255 times (with a 128 bit A). Of course you need to do a pre-check for a zero divisor.

How does this 128 bit integer multiplication work in assembly (x86-64)?

What GCC is doing is using the property that signed multiplication can be done using the following formula.

(hi,lo) = unsigned(x*y)
hi -= ((x<0) ? y : 0)  + ((y<0) ? x : 0)

Despite the fact that there is no need to do this since in this case the x86-64 instruction set has a signed 64-bit*64-bit to 128-bit instruction (imul with one operand) this formula is useful in other cases. For example for implementing signed 128-bit multiplication with SSE2/AVX2/AVX512 or for implementing 256-bit multiplication when the instruction set only does 128-bit multiplication (such as with x86-64).

GCC implemented this formula a bit differently though. If we take the sign bit and extend it to the whole word, call this function sign_ext, then the function returns -1 or 0. Then what GCC did is:

hi += sign_ext(x)*y + sign_ext(y)*x

for example sign_ext(x)*y in pseudo-instructions for 64-bit words is

sarq  $63, x    ; sign_ext(x)
imulq   y, x    ; sign_ext(x)*y

So now you ask (or meant to ask):

Why is this formula true?

That's a good qeustion. I asked this same question as well and njuffa wrote

@Zboson: It follows directly from two's complement complement representation. E.g. 32-bit integers -n and -m are represented as unsigned numbers x=2**32-n, y=2**32-m. If you multiply those you have x*y = 2**64 - 2**32*n - 2**32*m + n*m. The middle terms indicate the necessary corrections to the upper half of the product. Working through a simple example using -1*-1 should prove very instructive.