/* * JOCL - Java bindings for OpenCL * * Copyright 2009 Marco Hutter - http://www.jocl.org/ */ // Quad-Float functions for OpenCL float4 type. // Ported from quad-double (QD) package: // http://crd.lbl.gov/~dhbailey/mpdist/index.html inline float4 qfAssign(float value) { return (float4)(value, 0.0f, 0.0f, 0.0f); } inline float4 qfAssign2(float2 value) { return (float4)(value.x, value.y, 0.0f, 0.0f); } inline float4 qfNegate(float4 value) { return (float4)(-value.x, -value.y, -value.z, -value.w); } inline float two_sum(float a, float b, float *err) { float s = a + b; float bb = s - a; *err = (a - (s - bb)) + (b - bb); return s; } inline void three_sum(float *a, float *b, float *c) { float t1, t2, t3; t1 = two_sum(*a, *b, &t2); *a = two_sum(*c, t1, &t3); *b = two_sum(t2, t3, c); } inline void three_sum2(float *a, float *b, float *c) { float t1, t2, t3; t1 = two_sum(*a, *b, &t2); *a = two_sum(*c, t1, &t3); *b = t2 + t3; } inline float quick_two_sum(float a, float b, float *err) { float s = a + b; *err = b - (s - a); return s; } inline void renorm(float *c0, float *c1, float *c2, float *c3, float *c4) { float s0, s1, s2 = 0.0f, s3 = 0.0f; s0 = quick_two_sum(*c3, *c4, c4); s0 = quick_two_sum(*c2, s0, c3); s0 = quick_two_sum(*c1, s0, c2); *c0 = quick_two_sum(*c0, s0, c1); s0 = *c0; s1 = *c1; s0 = quick_two_sum(*c0, *c1, &s1); if (s1 != 0.0f) { s1 = quick_two_sum(s1, *c2, &s2); if (s2 != 0.0f) { s2 = quick_two_sum(s2, *c3, &s3); if (s3 != 0.0f) { s3 += *c4; } else { s2 += *c4; } } else { s1 = quick_two_sum(s1, *c3, &s2); if (s2 != 0.0f) { s2 = quick_two_sum(s2, *c4, &s3); } else { s1 = quick_two_sum(s1, *c4, &s2); } } } else { s0 = quick_two_sum(s0, *c2, &s1); if (s1 != 0.0f) { s1 = quick_two_sum(s1, *c3, &s2); if (s2 != 0.0f) { s2 = quick_two_sum(s2, *c4, &s3); } else { s1 = quick_two_sum(s1, *c4, &s2); } } else { s0 = quick_two_sum(s0, *c3, &s1); if (s1 != 0.0f) { s1 = quick_two_sum(s1, *c4, &s2); } else { s0 = quick_two_sum(s0, *c4, &s1); } } } *c0 = s0; *c1 = s1; *c2 = s2; *c3 = s3; } inline void qfAdd(float4 *sum, const float4 a, const float4 b) { float s0, s1, s2, s3; float t0, t1, t2, t3; s0 = two_sum(a.x, b.x, &t0); s1 = two_sum(a.y, b.y, &t1); s2 = two_sum(a.z, b.z, &t2); s3 = two_sum(a.w, b.w, &t3); s1 = two_sum(s1, t0, &t0); three_sum(&s2, &t0, &t1); three_sum2(&s3, &t0, &t2); t0 = t0 + t1 + t3; renorm(&s0, &s1, &s2, &s3, &t0); (*sum).x = s0; (*sum).y = s1; (*sum).z = s2; (*sum).w = s3; } inline void split(float a, float *hi, float *lo) { float temp = ((1<<12)+1) * a; *hi = temp - (temp - a); *lo = a - *hi; } inline float two_prod(float a, float b, float *err) { float a_hi, a_lo, b_hi, b_lo; float p = a * b; split(a, &a_hi, &a_lo); split(b, &b_hi, &b_lo); *err = ((a_hi * b_hi - p) + a_hi * b_lo + a_lo * b_hi) + a_lo * b_lo; return p; } inline void qfMul(float4 *prod, const float4 a, const float4 b) { float p0, p1, p2, p3, p4, p5; float q0, q1, q2, q3, q4, q5; float t0, t1; float s0, s1, s2; p0 = two_prod(a.x, b.x, &q0); p1 = two_prod(a.x, b.y, &q1); p2 = two_prod(a.y, b.x, &q2); p3 = two_prod(a.x, b.z, &q3); p4 = two_prod(a.y, b.y, &q4); p5 = two_prod(a.z, b.x, &q5); three_sum(&p1, &p2, &q0); three_sum(&p2, &q1, &q2); three_sum(&p3, &p4, &p5); s0 = two_sum(p2, p3, &t0); s1 = two_sum(q1, p4, &t1); s2 = q2 + p5; s1 = two_sum(s1, t0, &t0); s2 += (t0 + t1); s1 += a.x*b.w + a.y*b.z + a.z*b.y + a.w*b.x + q0 + q3 + q4 + q5; renorm(&p0, &p1, &s0, &s1, &s2); (*prod).x = p0; (*prod).y = p1; (*prod).z = p2; (*prod).w = p3; } inline void qfMulFloat(float4 *prod, const float4 a, const float b) { float p0, p1, p2, p3; float q0, q1, q2; float s0, s1, s2, s3, s4; p0 = two_prod(a.x, b, &q0); p1 = two_prod(a.y, b, &q1); p2 = two_prod(a.z, b, &q2); p3 = a.w * b; s0 = p0; s1 = two_sum(q0, p1, &s2); three_sum(&s2, &q1, &p2); three_sum2(&q1, &q2, &p3); s3 = q1; s4 = q2 + p2; renorm(&s0, &s1, &s2, &s3, &s4); (*prod).x = s0; (*prod).y = s1; (*prod).z = s2; (*prod).w = s3; } inline bool qfLessThan(float4 *a, float b) { return ((*a).x < b || ((*a).x == b && (*a).y < 0.0f)); } inline void renorm4(float *c0, float *c1, float *c2, float *c3) { float s0, s1, s2 = 0.0f, s3 = 0.0f; s0 = quick_two_sum(*c2, *c3, c3); s0 = quick_two_sum(*c1, s0, c2); *c0 = quick_two_sum(*c0, s0, c1); s0 = *c0; s1 = *c1; if (s1 != 0.0f) { s1 = quick_two_sum(s1, *c2, &s2); if (s2 != 0.0f) { s2 = quick_two_sum(s2, *c3, &s3); } else { s1 = quick_two_sum(s1, *c3, &s2); } } else { s0 = quick_two_sum(s0, *c2, &s1); if (s1 != 0.0f) { s1 = quick_two_sum(s1, *c3, &s2); } else { s0 = quick_two_sum(s0, *c3, &s1); } } *c0 = s0; *c1 = s1; *c2 = s2; *c3 = s3; } float4 qfDiv(const float4 a, const float4 b) { float q0, q1, q2, q3; float4 r; float4 p; q0 = a.x / b.x; // r = a - (b * q0); qfMulFloat(&p, b, q0); p = qfNegate(p); qfAdd(&r, a, p); q1 = r.x / b.x; // r -= (b * q1); qfMulFloat(&p, b, q1); p = qfNegate(p); qfAdd(&r, r, p); q2 = r.x / b.x; //r -= (b * q2); qfMulFloat(&p, b, q2); p = qfNegate(p); qfAdd(&r, r, p); q3 = r.x / b.x; renorm4(&q0, &q1, &q2, &q3); return (float4)(q0, q1, q2, q3); }