Can't Make Value Propagate Through Carry

Can't make value propagate through carry

Your code is very messy to me. I did (long)num classes many times before (floating,fixed,uint,templated,...) so here are some hints:

  1. Try to setup ALU architecture similar to real HW implementation.

    Most algorithms are written for such environment. It will clean and speed up your code. In some cases I use asm for this but if you want to be not CPU dependent you can use this class of mine

    ALU source in C++:

    //---------------------------------------------------------------------------
    //--- ALU32 class 2.01 ------------------------------------------------------
    //---------------------------------------------------------------------------
    #ifndef _ALU32_h
    #define _ALU32_h
    //---------------------------------------------------------------------------
    //#define _ALU32_no_asm
    //---------------------------------------------------------------------------
    class ALU32
    {
    public:
    BYTE cy;
    ALU32() { cy=0; }
    void sar(DWORD &c); // msb -> [msb...lsb] -> cy shift arithmetic right
    void shl(DWORD &c); // cy <- [msb...lsb] <- 0 shift left
    void shr(DWORD &c); // 0 -> [msb...lsb] -> cy shift right
    void rcl(DWORD &c); // cy <- [msb...lsb] <- cy shift through carry left
    void rcr(DWORD &c); // cy -> [msb...lsb] -> cy shift through carry lright
    void inc(DWORD &c);
    void dec(DWORD &c);
    void add(DWORD &c,DWORD a,DWORD b);
    void sub(DWORD &c,DWORD a,DWORD b);
    void adc(DWORD &c,DWORD a,DWORD b);
    void sbc(DWORD &c,DWORD a,DWORD b);
    void mul(DWORD &ch,DWORD &cl,DWORD a,DWORD b); // (ch,cl) = a*b
    void div(DWORD &c,DWORD &d,DWORD ah,DWORD al,DWORD b); // c = a/b d =a%b
    };
    //---------------------------------------------------------------------------
    void ALU32::inc(DWORD &c) { if (c==0xFFFFFFFF) cy=1; else cy=0; c++; }
    void ALU32::dec(DWORD &c) { if (c==0x00000000) cy=1; else cy=0; c--; }
    //---------------------------------------------------------------------------
    void ALU32::sar(DWORD &c)
    {
    cy=c&1;
    c=((c>>1)&0x7FFFFFFF)|(c&0x80000000);
    }
    //---------------------------------------------------------------------------
    void ALU32::shl(DWORD &c)
    {
    cy=c>>31;
    c=(c<<1)&0xFFFFFFFE;
    }
    //---------------------------------------------------------------------------
    void ALU32::shr(DWORD &c)
    {
    cy=c&1;
    c=(c>>1)&0x7FFFFFFF;
    }
    //---------------------------------------------------------------------------
    void ALU32::rcl(DWORD &c)
    {
    DWORD cy0=cy;
    cy=c>>31;
    c=((c<<1)&0xFFFFFFFE)|cy0;
    }
    //---------------------------------------------------------------------------
    void ALU32::rcr(DWORD &c)
    {
    DWORD cy0=cy;
    cy=c&1;
    c=((c>>1)&0x7FFFFFFF)|(cy0<<31);
    }
    //---------------------------------------------------------------------------
    void ALU32::add(DWORD &c,DWORD a,DWORD b)
    {
    c=a+b;
    cy=DWORD(((a &1)+(b &1) )>> 1);
    cy=DWORD(((a>>1)+(b>>1)+cy)>>31);
    }
    //---------------------------------------------------------------------------
    void ALU32::sub(DWORD &c,DWORD a,DWORD b)
    {
    c=a-b;
    if (a<b) cy=1; else cy=0;
    }
    //---------------------------------------------------------------------------
    void ALU32::adc(DWORD &c,DWORD a,DWORD b)
    {
    c=a+b+cy;
    cy=DWORD(((a &1)+(b &1)+cy)>> 1);
    cy=DWORD(((a>>1)+(b>>1)+cy)>>31);
    }
    //---------------------------------------------------------------------------
    void ALU32::sbc(DWORD &c,DWORD a,DWORD b)
    {
    c=a-b-cy;
    if (cy) { if (a<=b) cy=1; else cy=0; }
    else { if (a< b) cy=1; else cy=0; }
    }
    //---------------------------------------------------------------------------
    void ALU32::mul(DWORD &ch,DWORD &cl,DWORD a,DWORD b)
    {
    #ifdef _ALU32_no_asm
    const int _h=1; // this is MSW,LSW order platform dependent So swap 0,1 if your platform is different
    const int _l=0;
    union _u
    {
    DWORD u32;
    WORD u16[2];
    } u;
    DWORD al,ah,bl,bh;
    DWORD c0,c1,c2;
    // separate 2^16 base digits
    u.u32=a; al=u.u16[_l]; ah=u.u16[_h];
    u.u32=b; bl=u.u16[_l]; bh=u.u16[_h];
    // multiplication (al+ah<<16)*(bl+bh<<16) = al*bl + al*bh<<16 + ah*bl<<16 + ah*bh<<32
    c0=(al*bl);
    add(c1,al*bh,ah*bl);
    c2=(ah*bh)+(cy<<16);
    // add subresults
    add(c0,c0,(c1<<16)&0xFFFF0000); c1=((c1>>16)&0x0000FFFF)+cy;
    add(c1,c1,c2);
    // construct result from (c3,c2,c1,c0)
    ch=c1;
    cl=c0;
    #else
    DWORD _a,_b,_cl,_ch;
    _a=a;
    _b=b;
    asm {
    mov eax,_a
    mov ebx,_b
    mul ebx // H(edx),L(eax) = eax * ebx
    mov _cl,eax
    mov _ch,edx
    }
    cl=_cl;
    ch=_ch;
    #endif
    }
    //---------------------------------------------------------------------------
    void ALU32::div(DWORD &c,DWORD &d,DWORD ah,DWORD al,DWORD b)
    {
    #ifdef _ALU32_no_asm
    DWORD ch,cl,bh,bl,h,l,mh,ml;
    int e;
    // edge cases
    if (!b ){ c=0xFFFFFFFF; d=0xFFFFFFFF; cy=1; return; }
    if (!ah){ c=al/b; d=al%b; cy=0; return; }
    // align a,b for binary long division m is the shifted mask of b lsb
    for (bl=b,bh=0,mh=0,ml=1;bh<0x80000000;)
    {
    e=0; if (ah>bh) e=+1; // e = cmp a,b {-1,0,+1}
    else if (ah<bh) e=-1;
    else if (al>bl) e=+1;
    else if (al<bl) e=-1;
    if (e<=0) break; // a<=b ?
    shl(bl); rcl(bh); // b<<=1
    shl(ml); rcl(mh); // m<<=1
    }
    // binary long division
    for (ch=0,cl=0;;)
    {
    sub(l,al,bl); // a-b
    sbc(h,ah,bh);
    if (cy) // a<b ?
    {
    if (ml==1) break;
    shr(mh); rcr(ml); // m>>=1
    shr(bh); rcr(bl); // b>>=1
    continue;
    }
    al=l; ah=h; // a>=b ?
    add(cl,cl,ml); // c+=m
    adc(ch,ch,mh);
    }
    cy=0; c=cl; d=al;
    if ((ch)||(ah)) cy=1; // overflow
    #else
    DWORD _al,_ah,_b,_c,_d;
    _al=al;
    _ah=ah;
    _b=b;
    asm {
    mov eax,_al
    mov edx,_ah
    mov ebx,_b
    div ebx
    mov _c,eax // eax = H(edx),L(eax) / ebx
    mov _d,edx // edx = H(edx),L(eax) % ebx
    }
    c=_c;
    d=_d;
    #endif
    }
    //---------------------------------------------------------------------------
    #endif
    //---------------------------------------------------------------------------
    • mul and div are switchable between fast CPU assembly and slower C++ implementation with the #define _ALU32_no_asm
    • DWORD is 32 bit unsigned int and can be defined like typedef unsigned __int32 DWORD;
  2. So now if you want to add two arrays (fixed size N)

    It can be done like this:

    ALU32 alu;
    DWORD a[N],b[N],c[N]; // a[0] is LSB and a[N-1] is MSB

    alu.add(c[0],a[0],b[0]);
    for (int i=1;i<N;i++) alu.adc(c[i],a[i],b[i]);
    // here c[] = a[] + b[]

    it is a good idea to use the biggest base you can to improve speed. If you still need 8 bit ALU this can be also easily rewritten and even simplified due to direct access to carry. You can use 16 or 32 bit variables and extract 9th bit as carry directly from sub-results (looks like you are doing it).

  3. Your problem (copied from comment)

    My bet is that your problem is here:

    if (i<lhs.nbytes)
    {
    if (ret.data[i].data == 255 && ret.data[i + 1].carry == 1) increment(&trhs, i + 1);
    ret.data[i].data += ret.data[i + 1].carry;
    }

    carry should be applied always but the first time (you do it always but the last time). This also reveals other possibility how is your number stored?

    • data[0] is the LSB or MSB (low/most significant bit/byte...)?

    You have to start adding from lowest digits

    • so either you just applying carry the other way around
    • or you are adding from highest to lowest digits

    but booth are incorrect.

PS. in case you need 32 bit ALU style multiplication without asm in pure C/C++ see this link (but after last update the code here already contains such mul,div):

  • Building a logarithm function in C without using float type

CUDA - PTX carry propagation

The only data dependencies affecting an asm() statement are those that are explicitly expressed by the variable bindings. Note that you can bind register operands, but not the condition codes. Since in this code the result of __uaddo(a, b) is immediately being overwritten, the compiler determines that it does not contribute to the observable results, is therefore "dead code" and can be eliminated. This is easily checked by examining the generated machine code (SASS) for a release build with cuobjdump --dump-sass.

If we had slightly different code that does not allow the compiler to eliminate the code for __uaddo() outright, there would still be the issue that the compiler can schedule any instructions it likes between the code generated for __uaddo() and __uaddc(), and such instructions could destroy any setting of the carry flag due to __uaddo().

As a consequence, if one plans to use the carry flag for multi-word arithmetic, both carry-generating and carry-consuming instructions must occur in the same asm() statement. A worked example can be found in this answer that shows how to add 128-bit operands. Alternatively, if two separate asm() statements must be used, one could export the carry flag setting from the earlier one into a C variable, then import it into the subsequent asm() statement from there. I can't think of many situations where this would be practical, as the performance advantage of using the carry flag is likely lost.

Karatsuba Algorithm Overflow

lest consider positive numbers only for starter. lets have 8 bit numbers/digits x0,x1,y0,y1.

When we apply 8bit multiplication:

x0*y0 -> 8bit * 8bit -> 16 bit

It will produce up to 16 bit result. The top value is:

FFh*FFh = FE01h
255*255 = 65025

Now if we return to Karatsuba let

X = x0 + x1<<8
Y = y0 + y1<<8
Z = X*Y = z0 + z1<<8 + z2<<16

Now lets look at the bit width of zi

z2 = x1*y1         -> 16 bit
z1 = x1*y0 + x0*y1 -> 17 bit
z0 = x0*y0 -> 16 bit

Note that z1 is 17 bit as top value is

65025+65025 = 130050

So each of the zi overflows the 8bit. To handle that you simply take only the lowest 8 bits and the rest add to higher digit (like propagation of Carry). So:

z1 += z0>>8;   z0 &= 255;
z2 += z1>>8; z1 &= 255;
z3 = z2>>8;
Z = z0 + z1<<8 + z2<<16 + z3<<24

But usually HW implementation of the multiplication handles this on its own and give you result as two words instead of one. see Cant make value propagate through carry

So the result of 16 bit multiplication is 32 bit. Beware that for adding the 8bit sub results you need at least 10 bits as we are adding 3 numbers together or add and propagate them one by one with 9 bits (or 8 bits + 1 carry).

If you add signed values to this you need another one bit for sign. To avoid it remember signs of operands and use abs values ... and set the sign of result depending on the original signs.

For more details see these:

  • C++ 32bit * 32bit multiplication without asm
  • Fast bignum square computation

Modulo operation by bit operation in C

As @Uduru already pointed out, sum may become larger than 2^31 and therefore, left shifting will result in an overflow.

To prevent this, remember the following: left shifting by 1 is the same as multiplying by 2. So, (sum << 1) % c is the same as (sum * 2) % c. Now, the rules for modulo state the following:

(a * b) mod c == ((a mod c) * (b mod c)) mod c. 

So you might change the code to the following.

sum = ((sum % c) << 1) % c + a * ((b >> i) & 1);

Because c is guaranteed to be smaller than 2^31 (according to the quoted part), sum % c is also guaranteed to be smaller than 2^31.

Binary addition carrying from left to right

Emulating a Kogge-Stone adder, with the shift direction reversed, gives a nice algorithm,

uint64_t p = x ^ y;
uint64_t g = x & y;

g |= p & (g >> 1);
p &= p >> 1;

g |= p & (g >> 2);
p &= p >> 2;

g |= p & (g >> 4);
p &= p >> 4;

g |= p & (g >> 8);
p &= p >> 8;

g |= p & (g >> 16);
p &= p >> 16;

g |= p & (g >> 32);

uint64_t result = x ^ y ^ (g >> 1);

How to bitwise rotate left and rotate right on arbitrary sized number ranges?

You need to properly mask your sub results to throw away overlapping bits. Also note that <<,>> behavior is platform dependent so to avoid undefined behavior and logical collisions add proper masks...

I see it like this (in C++):

//---------------------------------------------------------------------------
#ifndef _WINDEF_
typedef unsigned __int32 DWORD; // define DWORD if windows header not present
#endif
//---------------------------------------------------------------------------
DWORD rol(DWORD x,int shift,int bits)
{
// masks | bits |
DWORD m0=(1<<(bits-shift))-1; // |0000000|11111111111|
// | shift | |
DWORD m1=(1<<shift)-1; // |00000000000|1111111|
// | | shift |
return ((x&m0)<<shift) | ((x>>(bits-shift))&m1);
}
//---------------------------------------------------------------------------
DWORD ror(DWORD x,int shift,int bits)
{
// masks | bits |
DWORD m0=(1<<(bits-shift))-1; // |0000000|11111111111|
// | shift | |
DWORD m1=(1<<shift)-1; // |00000000000|1111111|
// | | shift |
return ((x>>shift)&m0) | ((x&m1)<<(bits-shift));
}
//---------------------------------------------------------------------------

Where bits is the number of bits you want to have/emulate, shift is number of bits to shift (like x<<shift or x>>shift in C++) ...

Continuous 11 bit shift by 1 bit (rol on left, ror on right) for 1111011b:

10987654321098765432109876543210  10987654321098765432109876543210
-------------------------------- --------------------------------
00000000000000000000000001111011b 00000000000000000000000001111011b
00000000000000000000000011110110b 00000000000000000000010000111101b
00000000000000000000000111101100b 00000000000000000000011000011110b
00000000000000000000001111011000b 00000000000000000000001100001111b
00000000000000000000011110110000b 00000000000000000000010110000111b
00000000000000000000011101100001b 00000000000000000000011011000011b
00000000000000000000011011000011b 00000000000000000000011101100001b
00000000000000000000010110000111b 00000000000000000000011110110000b
00000000000000000000001100001111b 00000000000000000000001111011000b
00000000000000000000011000011110b 00000000000000000000000111101100b
00000000000000000000010000111101b 00000000000000000000000011110110b
00000000000000000000000001111011b 00000000000000000000000001111011b

The same for 2 bit shift:

10987654321098765432109876543210  10987654321098765432109876543210
-------------------------------- --------------------------------
00000000000000000000000001111011b 00000000000000000000000001111011b
00000000000000000000000111101100b 00000000000000000000011000011110b
00000000000000000000011110110000b 00000000000000000000010110000111b
00000000000000000000011011000011b 00000000000000000000011101100001b
00000000000000000000001100001111b 00000000000000000000001111011000b
00000000000000000000010000111101b 00000000000000000000000011110110b
00000000000000000000000011110110b 00000000000000000000010000111101b
00000000000000000000001111011000b 00000000000000000000001100001111b
00000000000000000000011101100001b 00000000000000000000011011000011b
00000000000000000000010110000111b 00000000000000000000011110110000b
00000000000000000000011000011110b 00000000000000000000000111101100b
00000000000000000000000001111011b 00000000000000000000000001111011b
00000000000000000000000111101100b 00000000000000000000011000011110b
00000000000000000000011110110000b 00000000000000000000010110000111b
00000000000000000000011011000011b 00000000000000000000011101100001b
00000000000000000000001100001111b 00000000000000000000001111011000b
00000000000000000000010000111101b 00000000000000000000000011110110b
00000000000000000000000011110110b 00000000000000000000010000111101b
00000000000000000000001111011000b 00000000000000000000001100001111b
00000000000000000000011101100001b 00000000000000000000011011000011b
00000000000000000000010110000111b 00000000000000000000011110110000b
00000000000000000000011000011110b 00000000000000000000000111101100b
00000000000000000000000001111011b 00000000000000000000000001111011b

Tested in VCL like this:

//---------------------------------------------------------------------------
AnsiString dec2bin(DWORD x)
{
AnsiString s;
int i;
s.SetLength(33);
for (i=0;i<32;i++,x>>=1) s[32-i]='0'+(x&1);
s[33]='b';
return s;
}
//---------------------------------------------------------------------------
__fastcall TForm1::TForm1(TComponent* Owner):TForm(Owner)
{
int i;
DWORD x=123,y=x;
mm_log->Lines->Add("10987654321098765432109876543210 10987654321098765432109876543210");
mm_log->Lines->Add("-------------------------------- --------------------------------");
for (i=0;i<32;i++)
{
mm_log->Lines->Add(dec2bin(x)+" "+dec2bin(y));
x=rol(x,2,11);
y=ror(y,2,11);
}
}
//-------------------------------------------------------------------------

I just tweaked bitshifts from my ALU32 from here:

  • Can't make value propagate through carry

to match your case...

Notes

In case shift is big you should add shift%=bits to both functions. Also if shift is negative use the other function ...

//---------------------------------------------------------------------------
#ifndef _WINDEF_
typedef unsigned __int32 DWORD; // define DWORD if windows header not present
#endif
//---------------------------------------------------------------------------
DWORD rol(DWORD x,int shift,int bits);
DWORD ror(DWORD x,int shift,int bits);
//---------------------------------------------------------------------------
DWORD rol(DWORD x,int shift,int bits)
{
if (shift<0) return ror(x,-shift,bits);
shift%=bits;
// masks | bits |
DWORD m0=(1<<(bits-shift))-1; // |0000000|11111111111|
// | shift | |
DWORD m1=(1<<shift)-1; // |00000000000|1111111|
// | | shift |
return ((x&m0)<<shift) | ((x>>(bits-shift))&m1);
}
//---------------------------------------------------------------------------
DWORD ror(DWORD x,int shift,int bits)
{
if (shift<0) return rol(x,-shift,bits);
shift%=bits;
// masks | bits |
DWORD m0=(1<<(bits-shift))-1; // |0000000|11111111111|
// | shift | |
DWORD m1=(1<<shift)-1; // |00000000000|1111111|
// | | shift |
return ((x>>shift)&m0) | ((x&m1)<<(bits-shift));
}
//---------------------------------------------------------------------------


Related Topics



Leave a reply



Submit