Emulate "Double" Using 2 "Float"S

Emulate double using two floats

double-float is a technique that uses pairs of single-precision numbers to achieve almost twice the precision of single precision arithmetic accompanied by a slight reduction of the single precision exponent range (due to intermediate underflow and overflow at the far ends of the range). The basic algorithms were developed by T.J. Dekker and William Kahan in the 1970s. Below I list two fairly recent papers that show how these techniques can be adapted to GPUs, however much of the material covered in these papers is applicable independent of platform so should be useful for the task at hand.

https://hal.archives-ouvertes.fr/hal-00021443
Guillaume Da Graça, David Defour
Implementation of float-float operators on graphics hardware,
7th conference on Real Numbers and Computers, RNC7.

http://andrewthall.org/papers/df64_qf128.pdf
Andrew Thall
Extended-Precision Floating-Point Numbers for GPU Computation.

How to emulate double precision using two floats in OpenGL ES?

First, the basic principle that is used to deal with this. Once you add or subtract numbers with a high exponent difference, the result gets rounded:

12345600000 + 8.76542995683848E-4 = 12345600000

Now as I showed you in here we can store our numbers as sum of more floats for example vec2, vec3, vec4 which are still floats but together can combine to bigger overall mantissa bitwidth. The link in your question is not using exponent ranges like I did but uses difference between rounded and not rounded results. However the linked lib uses just vec2 which is still much less precise than native 64 bit double as mantissa of double has 53 bits and float has just 24 bits so 24+24 = 48 < 53 that is why I decided to use vec3. Now the trick is to obtain the rounding error. For the same example as above:

a=12345600000 
b=8.76542995683848E-4
c=a+b=12345600000

the a,b are float operands for + operation and c is rounded result. so the difference e can be obtained like this:

e=c-a; // e= 0
e-=b; // e=-8.76542995683848E-4
e=-e; // e=+8.76542995683848E-4

where e is the stuff that should be added to c in order to get not rounded result.

Now if we store some part of number in each component of vec3 then we can try to add this e to all of them (always removing added part from e) until e is zero.

So if c.x+e does round we add it to c.y and so on... Based on this I managed to compose this:

//---------------------------------------------------------------------------
//--- High Precision float ver: 1.000 ---------------------------------------
//---------------------------------------------------------------------------
#ifndef _GLSL_HP32
#define _GLSL_HP32
//---------------------------------------------------------------------------
// helper functions (internals)
// Normalize c: sort its components in descending order of magnitude
// (|c.x| >= |c.y| >= |c.z|) using a fixed three-compare sorting network.
void hp32_nor(vec3 &c)
{
float t;
if (fabs(c.y)>fabs(c.x)){ t=c.y; c.y=c.x; c.x=t; }
if (fabs(c.z)>fabs(c.x)){ t=c.z; c.z=c.x; c.x=t; }
if (fabs(c.z)>fabs(c.y)){ t=c.z; c.z=c.y; c.y=t; }
}
// c += e: fold the rounding-error vector e into c, component by component.
// Each float addition may round; the unabsorbed remainder is written back
// into e so later passes can absorb it. The order of these 9 compensated
// additions is deliberate — do not reorder. Ends by re-sorting c.
void hp32_err(vec3 &c,vec3 &e) // c+=e; apply rounding error e correction to c
{
float q;
// try to absorb each error component into c.x first...
q=c.x; c.x+=e.x; e.x=e.x-(c.x-q);
q=c.x; c.x+=e.y; e.y=e.y-(c.x-q);
q=c.x; c.x+=e.z; e.z=e.z-(c.x-q);
// ...then push what rounded away into c.y...
q=c.y; c.y+=e.x; e.x=e.x-(c.y-q);
q=c.y; c.y+=e.y; e.y=e.y-(c.y-q);
q=c.y; c.y+=e.z; e.z=e.z-(c.y-q);
// ...and finally into c.z
q=c.z; c.z+=e.x; e.x=e.x-(c.z-q);
q=c.z; c.z+=e.y; e.y=e.y-(c.z-q);
q=c.z; c.z+=e.z; e.z=e.z-(c.z-q);
hp32_nor(c); // restore |c.x| >= |c.y| >= |c.z| ordering
}
// Split every component of a into a high half h and a low half l of its
// mantissa so that h+l == a exactly (Veltkamp/Dekker split). Beware: l may
// have the opposite sign of a.
void hp32_split(vec3 &h,vec3 &l,vec3 a) // (h+l)=a; split mantissas to half
{
const float n=8193.0; // 10000000000001 bin: shifts by ~half the mantissa bits
h=a*n; // scales a up by ~half the mantissa width (almost no rounding yet)
l=h-a; // rounds off the low half (h and a differ by ~half-mantissa exponents)
h-=l; // removes the `n*` scaling, leaving just the high half of a's mantissa
l=a-h; // exact difference between a and h: the low half (sign may differ)
}
//---------------------------------------------------------------------------
// double api (comment it out if double not present)
// double -> vec3: spread one double across three floats, head first.
// Each narrowing assignment keeps the part a float can represent; the
// remainder carries over to the next component. Sorted by magnitude at end.
vec3 hp32_set(double a) // double -> vec3
{
vec3 c;
c.x=a; a-=c.x; // head: nearest float to a
c.y=a; a-=c.y; // first remainder
c.z=a; hp32_nor(c); // second remainder; then sort desc by magnitude
return c;
}
double hp32_getl(vec3 a){ double c; c=a.z+a.y; c+=a.x; return c; } // vec3 -> double: sum smallest-first to minimize rounding
//---------------------------------------------------------------------------
// normal api
vec3 hp32_set(float a){ return vec3(a,0.0,0.0); } // float -> vec3: head holds a, tails are zero
float hp32_get(vec3 a){ float c; c=a.z+a.y; c+=a.x; return c; } // vec3 -> float: sum smallest-first (relies on hp32_nor ordering)
// = a+b: high-precision addition.
// Computes the component-wise sum c and the negated per-component rounding
// error e = -((a+b)-c), then folds e back into c via hp32_err so that the
// result captures the bits the float additions rounded away.
// (Cleanup: removed dead initializer `c=a+b` that was immediately
// overwritten, and the unused local `q`.)
vec3 hp32_add(vec3 a,vec3 b) // = a+b
{
vec3 c,e;
// c = a+b; e = (a+b)-c per component (rounding excess)
c.x=a.x+b.x; e.x=c.x-a.x; e.x-=b.x;
c.y=a.y+b.y; e.y=c.y-a.y; e.y-=b.y;
c.z=a.z+b.z; e.z=c.z-a.z; e.z-=b.z;
e=-e; hp32_err(c,e); // negate: e is now the missing part to add back
return c;
}
// = a-b: high-precision subtraction.
// BUGFIX: the original body computed `c.x=a.x+b.x` etc. (an addition) while
// the error terms `e += b` and the function contract are for subtraction —
// it therefore returned a+b with wrong error compensation. The component
// differences below now match the error formula e = (c-a)+b = c-(a-b).
vec3 hp32_sub(vec3 a,vec3 b) // = a-b
{
vec3 c,e;
// c = a-b; e = c-(a-b) per component (rounding excess)
c.x=a.x-b.x; e.x=c.x-a.x; e.x+=b.x;
c.y=a.y-b.y; e.y=c.y-a.y; e.y+=b.y;
c.z=a.z-b.z; e.z=c.z-a.z; e.z+=b.z;
e=-e; hp32_err(c,e); // negate: e is now the missing part to add back
return c;
}
// = a*b where a,b carry only half-width mantissas (from hp32_split).
// !!! internal helper — do not call directly !!!
// Each half-mantissa product is exact in float; only the additions below can
// round, so their errors are captured into e and f and folded back at the end.
vec3 hp32_mul_half(vec3 a,vec3 b)
{
// c = (a.x+a.y+a.z)*(b.x+b.y+b.z)   // multiplication of 2 expressions
// c = (a.x*b.x)+(a.x*b.y)+(a.x*b.z) // expanded
//    +(a.y*b.x)+(a.y*b.y)+(a.y*b.z)
//    +(a.z*b.x)+(a.z*b.y)+(a.z*b.z)
// c = (a.x*b.x)                     // ordered desc by magnitude (x>=y>=z)
//    +(a.x*b.y)+(a.y*b.x)
//    +(a.x*b.z)+(a.z*b.x)+(a.y*b.y)
//    +(a.y*b.z)+(a.z*b.y)
//    +(a.z*b.z)
vec3 c,e,f; float q,r;
// c=a*b; (e,f)=(a*b)-c; compensated accumulation of the partial products
c.x=(a.x*b.x);
r=(a.x*b.y); q=c.x; c.x+=r; e.x=r-(c.x-q);
r=(a.y*b.x); q=c.x; c.x+=r; e.y=r-(c.x-q);
c.y=(a.x*b.z);
r=(a.z*b.x); q=c.y; c.y+=r; e.z=r-(c.y-q);
r=(a.y*b.y); q=c.y; c.y+=r; f.x=r-(c.y-q);
c.z=(a.y*b.z);
r=(a.z*b.y); q=c.z; c.z+=r; f.y=r-(c.z-q);
r=(a.z*b.z); q=c.z; c.z+=r; f.z=r-(c.z-q);
// merge the two error vectors and fold them back into c
e=+hp32_add(e,f); hp32_err(c,e);
return c;
}
// = a*b: high-precision multiplication.
// Float products double the mantissa width, so each operand is first split
// into half-width mantissa halves; the four half-products are then exact
// and are accumulated largest-magnitude first.
vec3 hp32_mul(vec3 a,vec3 b) // = a*b
{
vec3 a_hi,a_lo,b_hi,b_lo,prod;
hp32_split(a_hi,a_lo,a);
hp32_split(b_hi,b_lo,b);
// (a_hi+a_lo)*(b_hi+b_lo) = hi*hi + hi*lo + lo*hi + lo*lo
prod=hp32_mul_half(a_hi,b_hi);
prod=hp32_add(prod,hp32_mul_half(a_hi,b_lo));
prod=hp32_add(prod,hp32_mul_half(a_lo,b_hi));
prod=hp32_add(prod,hp32_mul_half(a_lo,b_lo));
return prod;
}
//---------------------------------------------------------------------------
#endif
//---------------------------------------------------------------------------

For now I have tested this only on the CPU side (C++). In order to use it in GLSL, just comment out or remove the double api functions, which I used only to verify accuracy. Also change fabs to abs, or declare:

float fabs(float x){ return abs(x); }

Again I have some normalization function hp32_nor which sorts the components by magnitude so fabs(x)>=fabs(y)>=fabs(z) which is needed to get back to float and multiplication. The +,- does not need it.

The hp32_err is like addition between normal number and rounding error difference (its a bit horrible but it preserves accuracy as much as it can) described above.

I did not test this extensively yet!!! Looks like +,-,* operations are precise in comparison with double.

The multiplication implementation is a bit complex because a*b on floats has the result bitwidth of mantissa as sum of operands bitwidths. So to avoid rounding we need to first split the operands into halves. that can be done like this (analysed from the lib you linked):

// split float into h,l
float a,h,l,n;
n=8193.0; // n = 10000000000001.00000000000000000000b
a=123.4567; // a = 1111011.01110100111010101000b
h=a*n; // h = 11110110111100011000.11000000000000000000b
l=h-a; // l = 11110110111010011101.01010000000000000000b
h-=l; // h = 1111011.01110000000000000000b
l=a-h; // l = 0.00000100111010101000b

so float has 24 bits of mantissa and 8193 is (24/2)+1=13 bits wide. So once you multiply any float by it, the result needs about half of the mantissa bits more than are present, and it gets rounded off. Then it is just a matter of getting back to the original operand scale and obtaining the other half as the difference between the new half and the original float value. All of this is done in the helper function hp32_split.

Now the multiplication c=a*b on halves look like this:

c = (ah+al)*(bh+bl) = ah*bh + ah*bl + al*bh + al*bl

And each half multiplication a?*b? looks like this:

c = (a.x+a.y+a.z)*(b.x+b.y+b.z)     // multiplication of 2 expresions
c = (a.x*b.x)+(a.x*b.y)+(a.x*b.z) // expanded
+(a.y*b.x)+(a.y*b.y)+(a.y*b.z)
+(a.z*b.x)+(a.z*b.y)+(a.z*b.z)
c = (a.x*b.x) // ordered desc by magnitude (x>=y>=z)
+(a.x*b.y)+(a.y*b.x)
+(a.x*b.z)+(a.z*b.x)+(a.y*b.y)
+(a.y*b.z)+(a.z*b.y)
+(a.z*b.z)

so I divide it to 3 bracket terms per each component of c. Its important to sort the terms by magnitude to prevent rounding errors as much as possible. Along the summing of terms I just accumulate the error similarly like in addition.

Emulating FP64 with 2 FP32 on a GPU

You can get a rough estimate of the performance by counting the number of float operations required to implement each double-float operation. You would want to inspect binary code with cuobjdump --dump-sass to get an accurate count. I am showing a double-float multiplication below that takes full advantage of FMA (fused multiply-add) support on the GPU. For double-float addition code, I would point you to a paper by Andrew Thall as I do not have the time to code this up right now. From previous analysis I believe the addition code given in the paper is correct, and that it avoids common pitfalls in faster but less accurate implementations (which lose accuracy when the magnitude of the operands is within a factor of two).

If you are a registered CUDA developer you can download double-double code from NVIDIA's developer website (log in at https://developer.nvidia.com) which is under BSD license, and rework it relatively quickly into double-float code. NVIDIA's double-double code supports the operations addition, subtraction, division, square root, and reciprocal square root.

As you can see, the multiplication below requires 8 float instructions; unary negation is absorbed into FMA. The addition requires around 20 float instructions. However, the instruction sequences for double-float operations also require temporary variables, which increases register pressure and can decrease occupancy. A reasonably conservative estimate may therefore be that double-float arithmetic performs at 1/20 the throughput of native float arithmetic. You can easily measure this yourself, in the context relevant to you, i.e. your use case(s).

typedef float2 dblfloat;  // double-float pair: .y = head (high part), .x = tail (low part)

// Double-float multiplication: 8 float instructions total, with the unary
// negation absorbed into FMA. Operation order is significant and preserved.
__host__ __device__ __forceinline__
dblfloat mul_dblfloat (dblfloat x, dblfloat y)
{
    dblfloat prod, res;
    float s;
    /* head product; FMA recovers its exact low-order bits into the tail */
    prod.y = x.y * y.y;
    prod.x = fmaf (x.y, y.y, -prod.y);
    /* fold in tail*tail and the two cross terms */
    prod.x = fmaf (x.x, y.x, prod.x);
    prod.x = fmaf (x.y, y.x, prod.x);
    prod.x = fmaf (x.x, y.y, prod.x);
    /* renormalize so the tail is small relative to the head */
    s = prod.y + prod.x;
    res.x = (prod.y - s) + prod.x;
    res.y = s;
    return res;
}

Note that in various applications, full double-float arithmetic may not be necessary. Instead one can use float computation, augmented by error compensating techniques, one of the oldest of which is the Kahan summation. I gave a brief overview of easily available literature on such methods in a recent posting in the NVIDIA developer forums. In the comments above, Robert Crovella also pointed to a GTC 2015 talk by Scott LeGrand, which I haven't had time to check out yet.

As for accuracy, double-float has a representational precision of 49 (24+24+1) bits, compared with IEEE-754 double which provides 53 bits. However double-float cannot maintain this precision for operands small in magnitude, as the tail portion can become a denormal or zero. When denormal support is turned on, the 49 bits of precision are guaranteed for 2^-101 <= |x| < 2^128. Denormal support for float is turned on by default in the CUDA tool chain for architectures >= sm_20, which means all architectures supported by the currently shipping version, CUDA 7.0.

As opposed to operations on IEEE-754 double data, double-float operations are not correctly rounded. For the double-float multiplication above, using 2 billion random test cases (with all source operands and results within the bounds stated above), I observed an upper bound of 1.42e-14 for the relative error. I do not have data for the double-float addition, but its error bound should be similar.

Simple C example of add/sub/mul/div operations in double-precision floating-points using a single-precision Floating-point system

I was able to implement this code without using double precision as the calculations were in the range of Float.
Here's my implementation, let me know if I can optimize it better.

/* Fixed-point polynomial coefficients produced by TComp_Polynomial().
   Values are int64, scaled left by PRECISION bits — TODO confirm scale. */
typedef struct
{ int64_t tc0;   /* order-0 (constant) coefficient */
int64_t tc1;     /* order-1 coefficient */
int64_t tc2;     /* order-2 coefficient */
int64_t tc3;     /* order-3 coefficient */
int64_t tc4;     /* order-4 coefficient */
}POLYNOMIALS;

POLYNOMIALS polynomials = {0,0,0,0,0};
int16_t TempCompIndex;          /* NOTE(review): not used in this excerpt */
int64_t x[TC_TABLE_SIZE];       /* shadowed by locals in TComp_Polynomial */
int64_t y[TC_TABLE_SIZE];

/* Normal-equation matrix and right-hand side for the least-squares fit */
float matrix[POLYNOMIAL_ORDER+1][POLYNOMIAL_ORDER+1] = {0};
float average[POLYNOMIAL_ORDER+1] = {0};

/* Fit a polynomial of degree POLYNOMIAL_ORDER to the (Temp, Comp) pairs in
   TCtable[] via the normal equations solved with Gauss-Jordan elimination,
   and store the coefficients as fixed-point int64 in `polynomials`.
   NOTE(review): accumulators are float, so precision degrades for large
   tables or high orders — acceptable per the author's testing. */
void TComp_Polynomial()
{
int i;
int j;
int k;
int size;
/* BUGFIX: temp was uninitialized; when matrix[i][i] == 0 the guarded
   assignment below is skipped and temp was then read undefined. */
float temp = 0;
float sum = 0;
float powr = 0;
float prod;

/* local working copies of the table (these shadow the file-scope x/y) */
int64_t x[TC_TABLE_SIZE];
int64_t y[TC_TABLE_SIZE];

/* load x = temperature, y = compensation scaled up into fixed point */
for(i = 0; i < TC_TABLE_SIZE; i++)
{
x[i] = (int64_t) TCtable[i].Temp;
y[i] = (int64_t) TCtable[i].Comp<<PRECISION;
printf("x: %lld, y:%lld\n",x[i],y[i]);
}

size = i;

/* build the normal-equation matrix: matrix[i][j] = sum_k x[k]^(i+j) */
for(i = 0; i <= POLYNOMIAL_ORDER; i++)
{
for(j = 0; j <= POLYNOMIAL_ORDER; j++)
{
sum = 0;
for(k = 0; k < size; k++)
{
powr = pow(x[k],i+j);
sum += powr;
}

matrix[i][j] = sum;
printf("sum: %g\n",sum);
}
}

/* right-hand side: average[i] = sum_j y[j] * x[j]^i */
for(i = 0; i <= POLYNOMIAL_ORDER; i++)
{
sum = 0;

for(j = 0; j < size; j++)
{
powr = (float) pow(x[j],i);
printf("powr: %f\t",powr);

prod = (float) y[j] * powr;
printf("prod:%f \t %lld \t", prod,y[j]);

sum += (float) prod;
printf("sum: %f \n",sum);
}

average[i] = sum;
}
printf("\n\n");

/* Gauss-Jordan elimination: zero the off-diagonal entries column by column,
   updating the RHS in lockstep. */
for(i = 0; i <= POLYNOMIAL_ORDER; i++)
{
for(j = 0; j <= POLYNOMIAL_ORDER; j++)
{
if(j != i)
{
if(matrix[i][i]!= 0)
{
temp = matrix[j][i]/matrix[i][i];
}

/* BUGFIX: loop previously stopped at k < POLYNOMIAL_ORDER, leaving the
   last matrix column un-eliminated and feeding stale values into later
   pivot steps. */
for(k = i; k <= POLYNOMIAL_ORDER; k++)
{
matrix[j][k] -= temp*matrix[i][k];
}
printf("\n\n");

average[j] -= temp*average[i];
printf("#Avg%d:%g\n",j,average[j]);
}
}
}

print_matrix();
print_average();

/* Divide each RHS entry by its pivot to obtain the coefficients (n+1 of
   them for order n), truncated to fixed-point int64. */
#ifndef POLYNOMIAL_ORDER

#elif POLYNOMIAL_ORDER == 0
if(matrix[0][0] != 0)
{
polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
}
#elif POLYNOMIAL_ORDER == 1
if(matrix[1][1] != 0)
{
polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
}
#elif POLYNOMIAL_ORDER == 2
if(matrix[2][2] != 0)
{
polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
polynomials.tc2 = (int64_t) (average[2]/matrix[2][2]);
}
#elif POLYNOMIAL_ORDER == 3
if(matrix[3][3] != 0)
{
polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
polynomials.tc2 = (int64_t) (average[2]/matrix[2][2]);
polynomials.tc3 = (int64_t) (average[3]/matrix[3][3]);
}
#elif POLYNOMIAL_ORDER == 4
if(matrix[4][4] != 0)
{
polynomials.tc0 = (int64_t) (average[0]/matrix[0][0]);
polynomials.tc1 = (int64_t) (average[1]/matrix[1][1]);
polynomials.tc2 = (int64_t) (average[2]/matrix[2][2]);
polynomials.tc3 = (int64_t) (average[3]/matrix[3][3]);
polynomials.tc4 = (int64_t) (average[4]/matrix[4][4]);
}
#endif

}



/* Evaluate the fitted fixed-point polynomial at TEMP using Horner's scheme
   in pure 64-bit integer arithmetic (no floating point at runtime), then
   shift the result back down to the output scale.
   NOTE(review): coefficients were scaled by PRECISION in TComp_Polynomial
   but are shifted down by PRECISION_BITS here — confirm the two macros
   refer to the same value. */
int16_t calculate_equation(uint16_t TEMP)
{
int64_t Y = 0;
int16_t TempComp = 0;

/* Horner form selected at compile time by the polynomial order */
#ifndef POLYNOMIAL_ORDER
#elif POLYNOMIAL_ORDER == 0
Y = polynomials.tc0;
#elif POLYNOMIAL_ORDER == 1
Y = polynomials.tc1* ((int64_t)TEMP) + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 2
Y = (polynomials.tc2 * ((int64_t)TEMP) + polynomials.tc1)*(int64_t)TEMP + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 3
Y = ((polynomials.tc3 * ((int64_t)TEMP) + polynomials.tc2)*((int64_t)TEMP) + polynomials.tc1)*((int64_t)TEMP) + polynomials.tc0;
#elif POLYNOMIAL_ORDER == 4
Y = (((polynomials.tc4 * (int64_t)TEMP + polynomials.tc3)*(int64_t)TEMP + polynomials.tc2)*(int64_t)TEMP + polynomials.tc1)*(int64_t)TEMP + polynomials.tc0;
#endif
/* drop the fixed-point fraction bits to get the integer compensation */
TempComp = (int16_t) (Y>>PRECISION_BITS);

return TempComp;
}

/* Example driver calling the compensation once.
   NOTE(review): TempCompValue and Mon_Temp are not declared in this
   excerpt, the local TempComp is unused, and `void main` is non-standard
   (should be `int main(void)`) — these look like paste artifacts from the
   original application. */
void main(){
int16_t TempComp = 0;
TempCompValue = (int16_t) calculate_equation(Mon_Temp);
}

Note: Calculate_Equation() is being called once a second and it is required to not use float in order to avoid floating point arithmetic, hence I am using non-float variables in that function.

It is working right for me and haven't discovered any bug after initial testing.
Thanks every one for taking interest in my post, if not the answer, got to learn some new techniques. And thanks @chux.

How to simulate Single precision rounding with Doubles?

You want to use the library functions frexp and ldexp, which are standard C99 functions, and are available in Lua.

frexp takes a floating point number and separates the mantissa from the exponent. The resulting mantissa is either 0 or in one of the ranges [0.5, 1.0) or (-1.0, -0.5]. You can then remove any extra bits in the obvious way (floor(mantissa * 2^k)/2^k for non-negative values, for example). (Edited to add:) It would be better to subtract k from the exponent in the call to ldexp than to do the divide as shown, because I'm pretty sure that Lua doesn't guarantee that 2^k is precise.

ldexp is the inverse of frexp; you can use that to put the truncated number back together again.

I have no idea how to do this in Excel. Check the manual :) (Edited to add:) I suppose you could get roughly the same effect by dividing the number by 2 to the power of the ceiling of the log 2 of the number, and then doing the binary round as indicated above, and then reversing the process to recreate the original exponent. But I suspect the results would occasionally run into peculiarities with Excel's peculiar ideas about arithmetic.

Using two uint's to represent a double, then multiply?

If you want to replace double precision, you could look into using a "double single" representation where the operands are pairs of single-precision numbers referred to as the "head" and the "tail", and which satisfy the normalization requirement that |tail| <= 0.5 * ulp (|head|). CUDA's float2 is the appropriate type to hold the pairs of floats needed. For example, the less significant x-component would represent the "tail", and the more significant y-component the "head".

Note that this does not provide exactly the same accuracy as double precision. The precision of this representation is limited to 49 bits (24 bits from each of the single-precision mantissas plus 1 bit from the sign bit of the tail) vs 53 for double precision. The operations won't provide IEEE rounding, and the range may also be reduced somewhat due to overflow of temporary quantities inside the multiplication code. Subnormal quantities may also not be accurately representable.

I don't think the performance will be significantly better than using the native double-precision operations of your GPU, as register pressure will be higher and the number of single-precision operations required for each "double single" operation is fairly substantial. You could of course just try and measure the performance for both variants.

Below is an example of an addition in "double single" format. The source of the algorithm is noted in the comment. I believe that the cited work is in turn based on the work of D. Priest who did research in this subject matter in the late 1980s or early 1990s. For a worked example of a "double single" multiplication, you could take a look at the function __internal_dsmul() in the file math_functions.h that ships with CUDA.

A quick remark on 64-bit integer operations in CUDA (as one commenter noted this as a potential alternative). Current GPU hardware has very limited support for native 64-bit integer operations, basically providing conversion from and to floating-point types, and indexing using 64-bit addresses in loads and stores. Otherwise 64-bit integer operations are implemented via native 32-bit operations, as one can easily see by looking at disassembled code (cuobjdump --dump-sass).

/* Double-single addition.
   Based on: Andrew Thall, Extended-Precision Floating-Point Numbers for GPU
   Computation. Retrieved from http://andrewthall.org/papers/df64_qf128.pdf
   on 7/12/2011.
   Uses __fadd_rn throughout to prevent the compiler from contracting the
   compensated sums; the operation sequence is order-critical. */
__device__ float2 dsadd (float2 a, float2 b)
{
    float2 r;
    float hsum, tmp, herr, lsum, lerr, fin;

    /* two-sum of the heads: hsum + herr == a.y + b.y exactly */
    hsum = __fadd_rn (a.y, b.y);
    tmp  = __fadd_rn (hsum, -a.y);
    herr = __fadd_rn (__fadd_rn (a.y, tmp - hsum), __fadd_rn (b.y, -tmp));
    /* two-sum of the tails: lsum + lerr == a.x + b.x exactly */
    lsum = __fadd_rn (a.x, b.x);
    tmp  = __fadd_rn (lsum, -a.x);
    lerr = __fadd_rn (__fadd_rn (a.x, tmp - lsum), __fadd_rn (b.x, -tmp));
    /* combine partial sums and errors, then renormalize the pair */
    herr = __fadd_rn (herr, lsum);
    lsum = __fadd_rn (hsum, herr);
    herr = __fadd_rn (hsum - lsum, herr);
    herr = __fadd_rn (herr, lerr);
    r.y = fin = __fadd_rn (lsum, herr);
    r.x = __fadd_rn (lsum - fin, herr);
    return r;
}


Related Topics



Leave a reply



Submit