Tags: c++, c++20, polynomial-approximations

How to efficiently apply polynomials in C++ without a loop?


I want to get accurate approximations of some complex functions (pow, exp, log, log2, ...) faster than the ones provided by cmath in C++'s standard library.

To do this I want to exploit the way floating-point numbers are encoded: get the exponent and mantissa using bit manipulation, and then do a polynomial approximation. The mantissa is between 1 and 2, so I use an nth-order polynomial to approximate the target function on the domain x in [1, 2], and use bit manipulation and simple arithmetic on the float's representation to make the calculations work.
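
In other words, writing f = m * 2^e with 1 <= m < 2, the identity log2(f) = e + log2(m) reduces the problem to approximating log2(m) on [1, 2].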

I used np.polyfit to generate the polynomials. As an example, the following is the 7th-order polynomial I use to approximate log2 on 1 <= x <= 2 (np.polyfit returns coefficients from the highest power down):

import numpy as np

P = np.array(
    [
        0.01459855,
        -0.17811046,
        0.95074541,
        -2.91450247,
        5.67353733,
        -7.39616658,
        7.08511059,
        -3.23521156,
    ],
    dtype=float,
)

To apply the polynomial, multiply the first coefficient by x raised to the 7th power, the second by x raised to the 6th power, and so on, then sum the terms.

In code:

P[0] * x**7 + P[1] * x**6 + P[2] * x**5 + P[3] * x**4 + P[4] * x**3 + P[5] * x**2 + P[6] * x + P[7]

Of course this is very inefficient: the bigger powers are computed first, so there are a lot of duplicate multiplications. If we reverse the order, we can compute each power from the previous one, like so:

PR = P[::-1]  # reverse to ascending order: constant term first
s = 0  # running sum
c = 1  # current power of x
for i in PR:
    s += i * c
    c *= x
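
For reference, the textbook alternative with the same operation count is Horner's scheme, which evaluates from the highest-order coefficient down and needs no separate running power; a minimal sketch:

// Horner's scheme: p[0] + x*(p[1] + x*(p[2] + ...)), with p in ascending
// order (constant term first); one multiply and one add per coefficient.
inline double horner(const double* p, int n, double x) {
    double s = p[n - 1];
    for (int i = n - 2; i >= 0; --i)
        s = s * x + p[i];
    return s;
}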

And that accumulation loop is exactly what I do in C++:

#include <bit>      // std::bit_cast
#include <cstdint>  // uint32_t

// log2 polynomial coefficients in ascending order (constant term first)
constexpr double LOG2_POLY7[8] = {
    -3.23521156,
    7.08511059,
    -7.39616658,
    5.67353733,
    -2.91450247,
    0.95074541,
    -0.17811046,
    0.01459855,
};
constexpr float FU = 1.0 / (1 << 23);  // 2^-23: one unit of the 23-bit mantissa field

inline float fast_log2_accurate(float f) {
    // std::bit_cast avoids the strict-aliasing UB of dereferencing a
    // reinterpret_cast'ed pointer
    uint32_t bits = std::bit_cast<uint32_t>(f);
    int e = ((bits >> 23) & 0xff) - 127;     // unbiased exponent
    double m1 = 1 + (bits & 0x7fffff) * FU;  // mantissa value in [1, 2)
    double s = 0;
    double m = 1;
    for (const double& p : LOG2_POLY7) {
        s += p * m;
        m *= m1;
    }
    return e + s;
}

It is much faster than log2 from cmath, while getting the same accuracy:

log2(3.1415927f) = 1.651496171951294 : 42.68856048583984 nanoseconds
fast_log2_accurate(3.1415927f) = 1.651496171951294 : 9.967899322509766 nanoseconds

I compiled with Visual Studio 2022, compiler flags:

/permissive- /ifcOutput "x64\Release\" /GS /GL /W3 /Gy /Zc:wchar_t /Zi /Gm- /O2 /Ob1 /sdl /Fd"x64\Release\vc143.pdb" /Zc:inline /fp:fast /D "NDEBUG" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /errorReport:prompt /WX- /Zc:forScope /std:c17 /Gd /Oi /MD /std:c++20 /FC /Fa"x64\Release\" /EHsc /nologo /Fo"x64\Release\" /Ot /Fp"x64\Release\exponentiation.pch" /diagnostics:column /arch:AVX2

However, I think it can be more efficient: there is loop overhead, and if I can optimize the loop away, it should be faster.

How can I apply the polynomial without loop?


If the loop is already unrolled, then is it possible to do the calculation using SIMD instructions to make it even faster?


I have benchmarked the solutions provided in the answers below, plus some other functions I wrote before:

#include <bit>
#include <cmath>
#include <cstdint>
#include <numbers>
#include <vector>
using std::vector;

using std::numbers::ln2;
using std::numbers::pi;

constexpr double LOG2_POLY7[8] = {
    -3.23521156,
    7.08511059,
    -7.39616658,
    5.67353733,
    -2.91450247,
    0.95074541,
    -0.17811046,
    0.01459855,
};
constexpr float FU = 1.0 / (1 << 23);

inline float fast_log2_accurate(float f) {
    uint32_t bits = std::bit_cast<uint32_t>(f);
    int e = ((bits >> 23) & 0xff) - 127;
    double m1 = 1 + (bits & 0x7fffff) * FU;
    double s = 0;
    double m = 1;
    for (const double& p : LOG2_POLY7) {
        s += p * m;
        m *= m1;
    }
    return e + s;
}

// Horner's scheme via template recursion: a[0] + x * (a[1] + x * (...)).
// Note the base case must return a[0] (not x * a[0]) and the top-level call
// must be poly<7> for 8 coefficients, otherwise the recursion reads past
// the end of the array.
template <int N> inline double poly(const double* a, const double x) {
    return (a[0] + x * poly<N - 1>(a + 1, x));
}

template <> inline double poly<0>(const double* a, const double x) {
    return a[0];
}

inline float fast_log2_accurate2(float f) {
    uint32_t bits = std::bit_cast<uint32_t>(f);
    int e = ((bits >> 23) & 0xff) - 127;
    double m1 = 1 + (bits & 0x7fffff) * FU;
    return e + poly<7>(LOG2_POLY7, m1);  // degree 7: coefficients 0..7
}


inline float fast_log2_accurate3(float f) {
    uint32_t bits = std::bit_cast<uint32_t>(f);
    int e = ((bits >> 23) & 0xff) - 127;
    double m1 = 1 + (bits & 0x7fffff) * FU;
    double s = 0;
    double m = 1;
    for (int i = 0; i < 8; i++) {
        s += LOG2_POLY7[i] * m;
        m *= m1;
    }
    return e + s;
}

// Lookup table: log2(i) - 150 for every 24-bit value i (implicit bit included);
// 150 = 127 (exponent bias) + 23 (mantissa bits). Entry 0 is -inf.
vector<float> log2_float() {
    int lim = 1 << 24;
    vector<float> table(lim);
    for (int i = 0; i < lim; i++) {
        table[i] = float(log(i) / ln2) - 150;
    }
    return table;
}
const vector<float> LOG2_FLOAT = log2_float();

inline float fast_log2(float f) {
    uint32_t bits = std::bit_cast<uint32_t>(f);
    int e = (bits >> 23) & 0xff;  // raw (biased) exponent field
    int m = bits & 0x7fffff;      // mantissa field
    // e == 0 means subnormal: no implicit leading bit
    return (e == 0 ? LOG2_FLOAT[m << 1] : e + LOG2_FLOAT[m | 0x00800000]);
}

inline float fast_log(float f) {
    return fast_log2(f) * ln2;
}

// Same idea for double, using only the top 24 mantissa bits (hence the limited
// accuracy); 1075 = 1023 (exponent bias) + 52 (mantissa bits).
vector<double> log2_double() {
    uint64_t lim = 1 << 24;
    vector<double> table(lim);
    for (uint64_t i = 0; i < lim; i++) {
        table[i] = log(i << 29) / ln2 - 1075;
    }
    return table;
}
const vector<double> LOG2_DOUBLE = log2_double();

inline double fast_log2_double(double d) {
    uint64_t bits = std::bit_cast<uint64_t>(d);
    uint64_t e = (bits >> 52) & 0x7ff;    // raw (biased) exponent field
    uint64_t m = bits & 0xfffffffffffff;  // 52-bit mantissa field
    return (e == 0 ? LOG2_DOUBLE[m >> 28] : e + LOG2_DOUBLE[(m | 0x10000000000000) >> 29]);
}
Benchmark:

fast_log2(3.1415927f) = 1.651496887207031 : 0.2610206604003906 nanoseconds
log2f(3.1415927f) = 1.651496171951294 : 33.27693939208984 nanoseconds
fast_log2_double(pi) = 1.651496060131421 : 0.3225326538085938 nanoseconds
fast_log2_accurate(3.1415927f) = 1.651496171951294 : 8.907032012939453 nanoseconds
fast_log2_accurate3(3.1415927f) = 1.651496171951294 : 7.831001281738281 nanoseconds
fast_log2_accurate2(3.1415927f) = 1.651496171951294 : 13.57889175415039 nanoseconds

While the two functions that use a lookup table are unbeatable, they are fairly inaccurate. I explicitly used log2f in my benchmark; as you can see, in MSVC it is quite slow.

The recursive function slows down the code significantly, as expected. Using an old-style indexed loop makes the code run 2 nanoseconds faster. However, I wasn't able to benchmark the version that uses std::index_sequence: it caused compiler errors that I wasn't able to resolve.


There was a bug in my benchmarking code that made the timing of the recursive version inaccurate (it inflated the measured time); I have fixed that.

The solution from the newest answer:

inline float fast_log2_accurate4(float f) {
    uint32_t bits = std::bit_cast<uint32_t>(f);
    int e = ((bits >> 23) & 0xff) - 127;
    float m = 1 + (bits & 0x7fffff) * FU;
    // two independent accumulation chains (even and odd powers); note this
    // version computes in float, not double
    float s_even = LOG2_POLY7[0];
    float s_odd = LOG2_POLY7[1] * m;
    float m2 = m * m;
    float m_even = m2;
    float m_odd = m * m2;

    for (int i = 2; i < 8; i += 2) {
        s_even += LOG2_POLY7[i] * m_even;
        s_odd += LOG2_POLY7[i + 1] * m_odd;
        m_even *= m2;
        m_odd *= m2;
    }
    return e + s_even + s_odd;
}
Benchmark:

fast_log2_accurate4(3.1415927f) = 1.651496887207031 : 17.01173782348633 nanoseconds

It isn't as accurate as my code and takes longer, because each iteration is more expensive. (The accuracy loss is presumably because it computes in float rather than double.)


The index sequence version previously failed to compile because I used double[8] instead of std::array<double, 8>; I thought they were the same thing! After it was pointed out, I fixed that and it compiled successfully.
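
For reference, std::get<I> is only provided for tuple-like types such as std::array, std::tuple and std::pair; there is no overload for built-in arrays, which is why double[8] could not work. A small illustration (not from my actual code):

#include <array>

constexpr double raw[8] = {};               // built-in array
constexpr std::array<double, 8> arr = {};   // tuple-like

// double a = std::get<0>(raw);  // error: no std::get for built-in arrays
double b = std::get<0>(arr);     // OK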

Benchmark:

ln(256) = 5.545613288879395 : 3.985881805419922 nanoseconds
log(256) = 5.545177459716797 : 7.047939300537109 nanoseconds
fast_log2(3.1415927f) = 1.651496887207031 : 0.25787353515625 nanoseconds
log2f(3.1415927f) = 1.651496171951294 : 35.03541946411133 nanoseconds
fast_log2_double(pi) = 1.651496060131421 : 0.3331184387207031 nanoseconds
fast_log2_accurate(3.1415927f) = 1.651496171951294 : 9.366512298583984 nanoseconds
fast_log2_accurate3(3.1415927f) = 1.651496171951294 : 7.454872131347656 nanoseconds
fast_log2_accurate2(3.1415927f) = 1.651496171951294 : 14.07079696655273 nanoseconds
fast_log2_accurate4(3.1415927f) = 1.651496887207031 : 16.6351318359375 nanoseconds
fast_log2_accurate5(3.1415927f) = 1.651496171951294 : 7.868862152099609 nanoseconds

It turns out ln is extremely fast, and the only way to beat it is to use a lookup table, but that gives only 3 correct decimal digits: np.log(256) gives 5.545177444479562. In contrast, my fastest functions give 6 correct decimal digits and are ten times faster. I only need to multiply the result by ln 2 to get ln(x), and it would still be more accurate.

I have benchmarked the solutions multiple times; fast_log2_accurate5 is the index sequence version. It and the old-style loop version are consistently faster than my range-based for loop version. Sometimes the old-style loop version is faster, sometimes the index sequence version; at this level the measured values fluctuate greatly, and I am running many other programs at the same time.

But the performance of the index sequence version seems much more stable than the for loop version's, so I will accept it.


Update:

I have revisited the code and made just one little change to the index sequence version: I added inline in front of the do_loop function. This little change makes the code run in less than a nanosecond, and I can throw in more terms without slowing the code down too much; it would still be significantly faster than log2 while getting very accurate results.

In contrast, the std::apply version is slow, even with inline:

constexpr std::array<double, 8> LOG2_POLY7A = {
    -3.23521156,
    7.08511059,
    -7.39616658,
    5.67353733,
    -2.91450247,
    0.95074541,
    -0.17811046,
    0.01459855,
};

template <std::size_t... I>
inline double do_loop(double m1, std::index_sequence<I...>) {
    double s = 0;
    double m = 1;
    ((s += std::get<I>(LOG2_POLY7A) * m, m *= m1), ...);
    return s;
}

inline float fast_log2_accurate5(float f) {
    uint32_t bits = std::bit_cast<uint32_t>(f);
    int e = ((bits >> 23) & 0xff) - 127;
    double m1 = 1 + (bits & 0x7fffff) * FU;
    double s = do_loop(m1, std::make_index_sequence<8>{});
    return e + s;
}

inline double do_loop1(double m1) {
    double s = 0;
    double m = 1;
    auto worker = [&](auto&...term) {
        ((s += term * m, m *= m1), ...);
        };
    std::apply(worker, LOG2_POLY7A);
    return s;
}

inline float fast_log2_accurate6(float f) {
    uint32_t bits = std::bit_cast<uint32_t>(f);
    int e = ((bits >> 23) & 0xff) - 127;
    double m1 = 1 + (bits & 0x7fffff) * FU;
    double s = do_loop1(m1);
    return e + s;
}
Benchmark:

fast_log2_accurate5(3.1415927f) = 1.651496171951294 : 0.9766578674316406 nanoseconds
fast_log2_accurate6(3.1415927f) = 1.651496171951294 : 7.168102264404297 nanoseconds

Solution

  • You can unroll the loop with std::index_sequence, as follows:

    #include <array>
    #include <bit>      // std::bit_cast
    #include <cstdint>
    #include <utility>
    
    constexpr std::size_t size_log2 = 8;
    
    constexpr std::array<double, size_log2> LOG2_POLY7 = {
        -3.23521156,
        7.08511059,
        -7.39616658,
        5.67353733,
        -2.91450247,
        0.95074541,
        -0.17811046,
        0.01459855,
    };
    constexpr float FU = 1.0 / (1 << 23);
    
    template <std::size_t... I>
    double do_loop(double m1, std::index_sequence<I...>) {
        double s = 0;
        double m = 1;
        ((s += std::get<I>(LOG2_POLY7) * m, m *= m1),...);
        return s;
    }
    
    inline float fast_log2_accurate(float f) {
        uint32_t bits = std::bit_cast<uint32_t>(f);  // avoids strict-aliasing UB
        int e = ((bits >> 23) & 0xff) - 127;
        double m1 = 1 + (bits & 0x7fffff) * FU;
        double s = do_loop(m1, std::make_index_sequence<size_log2>{});
        return e + s;
    }
    

    Test the code on godbolt.

    Notice that I have also moved the variable m inside the do_loop function, as it is only required there. And, per a suggestion in the comments, do_loop returns the result that was stored in the variable s in the question. Compared to your initial version, this unrolling of the loop is done at compile time and avoids, for instance, comparing p with LOG2_POLY7.end() at every iteration. As always, the actual gain should be benchmarked.
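
    To make the effect concrete, the fold expands to the equivalent of the following straight-line code (written out by hand here for illustration); there is no loop counter and no end-of-array comparison left:

    // Hand-expanded equivalent of do_loop for the 8 coefficients.
    inline double do_loop_expanded(double m1) {
        double s = 0;
        double m = 1;
        s += LOG2_POLY7[0] * m; m *= m1;
        s += LOG2_POLY7[1] * m; m *= m1;
        s += LOG2_POLY7[2] * m; m *= m1;
        s += LOG2_POLY7[3] * m; m *= m1;
        s += LOG2_POLY7[4] * m; m *= m1;
        s += LOG2_POLY7[5] * m; m *= m1;
        s += LOG2_POLY7[6] * m; m *= m1;
        s += LOG2_POLY7[7] * m;  // the final m *= m1 has no effect on the result
        return s;
    }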

    Also, as noted in the comments, the loop can be simplified by using std::apply to convert the array LOG2_POLY7 into variadic arguments; see below or the other answer.

    As asked in the comments, one can generalize the do_loop function to work for a generic std::array. However, this requires an additional routine. This is because, in the do_loop above, the type of the index_sequence is determined by the non-type template parameters std::size_t... I; these parameters, and hence the type of the second parameter of do_loop, are deduced. Now, given a generic std::array, you can deduce the size N, but to make do_loop work you need to convert this N into the sequence of indices 0...N-1: that is precisely what index_sequence is for.

    So, for a more generic routine, one can replace the code above with:

    template <std::size_t N, std::size_t... I>
    double do_loop_impl(double m1, const std::array<double, N>& data, std::index_sequence<I...>) {
        double s = 0;
        double m = 1;
        ((s += std::get<I>(data) * m, m *= m1),...);
        return s;
    }
    
    template <std::size_t N>
    double do_loop(double m1, const std::array<double, N>& data) {
        return do_loop_impl(m1, data, std::make_index_sequence<N>{});
    }
    
    inline float fast_log2_accurate(float f) {
        uint32_t bits = std::bit_cast<uint32_t>(f);
        int e = ((bits >> 23) & 0xff) - 127;
        double m1 = 1 + (bits & 0x7fffff) * FU;
        double s = do_loop(m1, LOG2_POLY7);
        return e + s;
    }
    

    Alternatively, one can use std::apply, slightly generalizing the other answer:

    template <std::size_t N>
    double do_loop(double m1, const std::array<double, N>& data) {
        return std::apply([m1](auto... p) {
                          double s = 0;
                          double m = 1;
                          ((s += p * m, m *= m1), ...);
                          return s; }, data);
    }
    
    inline float fast_log2_accurate(float f) {
        uint32_t bits = std::bit_cast<uint32_t>(f);
        int e = ((bits >> 23) & 0xff) - 127;
        double m1 = 1 + (bits & 0x7fffff) * FU;
        double s = do_loop(m1, LOG2_POLY7);
        return e + s;
    }
    

    (Remember to include <tuple> if you use std::apply).

    Finally, if you are interested in speed, you could consider SIMD vectorization; see e.g. OpenMP SIMD directives, the Vector Class Library, or the xsimd library. This probably requires a rethinking of the loops.
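
    For instance, here is a minimal sketch (my illustration; the batch function and coefficient array names are not from the question): the polynomial itself is a sequential dependency chain, so the natural approach is to evaluate many inputs at once and let the compiler vectorize across them with an OpenMP SIMD directive (enabled with -fopenmp-simd on GCC/Clang, or /openmp:experimental on MSVC):

    #include <array>
    #include <bit>
    #include <cstdint>

    constexpr std::array<double, 8> COEFFS = {
        -3.23521156, 7.08511059, -7.39616658, 5.67353733,
        -2.91450247, 0.95074541, -0.17811046, 0.01459855,
    };

    // Evaluate log2 for a whole array: independent elements fill the SIMD
    // lanes, while each element's polynomial (Horner) stays sequential.
    void fast_log2_batch(const float* in, float* out, int n) {
    #pragma omp simd
        for (int i = 0; i < n; ++i) {
            uint32_t bits = std::bit_cast<uint32_t>(in[i]);
            int e = ((bits >> 23) & 0xff) - 127;
            double m1 = 1 + (bits & 0x7fffff) * (1.0 / (1 << 23));
            double s = COEFFS[7];  // Horner: highest-power coefficient first
            for (int k = 6; k >= 0; --k) s = s * m1 + COEFFS[k];
            out[i] = float(e + s);
        }
    }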