glibc
(more precise, libm
) and many other C libraries contain the following software implementation of the atan()
function (see here):
/* s_atanf.c -- float version of s_atan.c.
* Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
*/
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
#if defined(LIBM_SCCS) && !defined(lint)
static char rcsid[] = "$NetBSD: s_atanf.c,v 1.4 1995/05/10 20:46:47 jtc Exp $";
#endif
#include <float.h>
#include <math.h>
#include <math_private.h>
#include <math-underflow.h>
#include <libm-alias-float.h>
static const float atanhi[] = {
4.6364760399e-01, /* atan(0.5)hi 0x3eed6338 */
7.8539812565e-01, /* atan(1.0)hi 0x3f490fda */
9.8279368877e-01, /* atan(1.5)hi 0x3f7b985e */
1.5707962513e+00, /* atan(inf)hi 0x3fc90fda */
};
static const float atanlo[] = {
5.0121582440e-09, /* atan(0.5)lo 0x31ac3769 */
3.7748947079e-08, /* atan(1.0)lo 0x33222168 */
3.4473217170e-08, /* atan(1.5)lo 0x33140fb4 */
7.5497894159e-08, /* atan(inf)lo 0x33a22168 */
};
static const float aT[] = {
3.3333334327e-01, /* 0x3eaaaaaa */
-2.0000000298e-01, /* 0xbe4ccccd */
1.4285714924e-01, /* 0x3e124925 */
-1.1111110449e-01, /* 0xbde38e38 */
9.0908870101e-02, /* 0x3dba2e6e */
-7.6918758452e-02, /* 0xbd9d8795 */
6.6610731184e-02, /* 0x3d886b35 */
-5.8335702866e-02, /* 0xbd6ef16b */
4.9768779427e-02, /* 0x3d4bda59 */
-3.6531571299e-02, /* 0xbd15a221 */
1.6285819933e-02, /* 0x3c8569d7 */
};
static const float
one = 1.0,
huge = 1.0e30;
float __atanf(float x)
{
float w,s1,s2,z;
int32_t ix,hx,id;
GET_FLOAT_WORD(hx,x);
ix = hx&0x7fffffff;
if(ix>=0x4c000000) { /* if |x| >= 2^25 */
if(ix>0x7f800000)
return x+x; /* NaN */
if(hx>0) return atanhi[3]+atanlo[3];
else return -atanhi[3]-atanlo[3];
} if (ix < 0x3ee00000) { /* |x| < 0.4375 */
if (ix < 0x31000000) { /* |x| < 2^-29 */
math_check_force_underflow (x);
if(huge+x>one) return x; /* raise inexact */
}
id = -1;
} else {
x = fabsf(x);
if (ix < 0x3f980000) { /* |x| < 1.1875 */
if (ix < 0x3f300000) { /* 7/16 <=|x|<11/16 */
id = 0; x = ((float)2.0*x-one)/((float)2.0+x);
} else { /* 11/16<=|x|< 19/16 */
id = 1; x = (x-one)/(x+one);
}
} else {
if (ix < 0x401c0000) { /* |x| < 2.4375 */
id = 2; x = (x-(float)1.5)/(one+(float)1.5*x);
} else { /* 2.4375 <= |x| < 2^66 */
id = 3; x = -(float)1.0/x;
}
}}
/* end of argument reduction */
z = x*x;
w = z*z;
/* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*(aT[6]+w*(aT[8]+w*aT[10])))));
s2 = w*(aT[1]+w*(aT[3]+w*(aT[5]+w*(aT[7]+w*aT[9]))));
if (id<0) return x - x*(s1+s2);
else {
z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
return (hx<0)? -z:z;
}
}
libm_alias_float (__atan, atan)
I am understanding how that function works, except for the values in the aT[]
array. The actual work happens after the argument reduction in the following lines:
z = x*x;
w = z*z;
/* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*(aT[6]+w*(aT[8]+w*aT[10])))));
s2 = w*(aT[1]+w*(aT[3]+w*(aT[5]+w*(aT[7]+w*aT[9]))));
if (id<0) return x - x*(s1+s2);
/* ... */
If we carry out the multiplications and the subtraction in the last line, we get the power series of atan()
: x - 1/3 * x^3 + 1/5 * x^5 - 1/7 * x^7 ...
. However, the coefficients in the aT[]
array are different from the power series' coefficients:
The first one is as expected in the sense that it is the float nearest to 1/3
. The next ones more and more deviate from the expected values, each one more than its predecessor, until the last one, which is expected to be 1/23
(4.347826e-2
if truncated to float precision), but actually is 1.6285819933e-02
.
Why do the coefficients deviate so much from the expected values? Is this a mathematical trick to mitigate the fact that the power series has to be aborted at some point (in this case, at x^23
)?
As already alluded to in comments, the coefficients are those of a polynomial minimax approximation P, here atan(x) = x - x3 · P(x2). Such an approximation minimizes the maximum error over a particular approximation interval [a, b), here [0, 7/16).
A polynomial minimax approximation has particular properties. The difference function d(x) = f(x) - P(x) has n+2 extrema for a polynomial of degree n, and neighboring extrema have opposite sign. This is the so-called equioscillation property. There will be one extremum each at a and b. In general, coefficients for a minimax approximation have to be determined numerically. The most commonly used procedure for this is called the Remez algorithm 1, 2, due to the Soviet mathematician Evgeny Remez. Software packages like Mathematica or Maple have facilities for this, as does the open-source Sollya tool.
The basic idea is simple: Choose n+2 values xi and set up n+2 equations P(xi) ± d = f(xi), where the sign of d alternates between consecutive i. This results in a system of n+2 equations in n+2 unknowns: n+1 coefficients plus d. The task then is to choose the initial set of xi and manipulate them until d is minimized. While there is one unique minimax approximation mathematically, a numerical procedure is going to deliver a near-minimax approximation that can be arbitrarily close. It is highly advisable to perform these intermediate calculations with much higher precision than the target precision of the coefficients.
When I generate a polynomial approximation of degree ten (with relative error as the error criterion) using a custom implementation of the Remez algorithm from the early 1990s written in C, the following coefficients result:
3.3333334327e-01f, /* 0x3eaaaaab */
-2.0000000298e-01f, /* 0xbe4ccccd */
1.4285714924e-01f, /* 0x3e124925 */
-1.1111110449e-01f, /* 0xbde38e38 */
9.0908870101e-02f, /* 0x3dba2e6e */
-7.6918721199e-02f, /* 0xbd9d8790 */
6.6610343754e-02f, /* 0x3d886b01 */
-5.8333244175e-02f, /* 0xbd6eeed7 */
4.9759168178e-02f, /* 0x3d4bd045 */
-3.6510344595e-02f, /* 0xbd158bdf */
1.6265589744e-02f, /* 0x3c853f6a */
This is very close to the set of coefficients used in the code from the question. It should be noted that a 10th-degree polynomial is overkill here, as this can provide an approximation with a relative error < 1e-17, that is, sufficient for an IEEE-754 double-precision implementation. The reason for this is presumably that the code shown was derived from a double
implementation of atan()
.
For an implementation of atanf()
using float
computation mapped to IEEE-754 binary32
, a 4th-degree polynomial would be sufficient:
static const float aT[] = {
3.3333304524e-01f, /* 0x3eaaaaa1 */
-1.9997754693e-01f, /* 0xbe4cc6ea */
1.4229640365e-01f, /* 0x3e11b626 */
-1.0490779579e-01f, /* 0xbdd6d9e6 */
5.8191072196e-02f /* 0x3d6e59c3 */
};
s1 = z*(aT[0]+w*(aT[2]+w*aT[4]));
s2 = w*(aT[1]+w*aT[3]);
1 E. Remes, "Sur un procédé convergent d'approximations successives pour déterminer les polynomes d'approximation." Comptes rendus hebdomadaires des séances de l'Académie des sciences, 198, Jan.-Jun.1934, pp. 2063-2065 (presented at the session of June 4, 1934).
2 Eugène Remes, "Sur le calcul effectif des polynomes d'approximation de Tchebichef." Comptes rendus hebdomadaires des séances de l'Académie des sciences, 199, Jul.-Dec.1934, pp. 337-340 (presented at the session of July 23, 1934).