Question Details

No question body available.

Tags

c visual-c++ numerical-methods polynomials icx

Answers (2)

August 6, 2025 Score: 2 Rep: 152,998 Quality: Medium Completeness: 100%

Here are some suggestions:

  • try not using global variables. Loading global addresses may prove inefficient in terms of generated code.

  • use sizet instead of int for the index and length variables: nd = (nd + 1) / 2; does not compile to a simple shift if the compiler cannot ascertain that nd is non negative.

  • the switch may be a source of inefficiencies. Since you can assume access to d[0] through d[7], you could turn the switch into a single branchless expression:

    static const double f[] = { 0, 0, 0, 0, 1, 1, 1, 1 };
    d[nd] = d[nd + 1] = d[nd + 2] = d[nd + 3] = 0;
    return          d[0] * f[nd+3] + x * d[1] * f[nd+2] +
            x * x * (d[2] * f[nd+1] + x * d[3] * f[nd+0]);
    
  • try and separate logging timings and printing results: calling printf in the timing loop may flush some cache lines with adverse effects depending on the library implementation and the buffer size.

Since none of the above help, here is a modified version with easy customisation:

#define FINALDISPATCH 1
#define LOOPTHRESHOLD 6
#if FINALDISPATCH
static double finalestrin0(double x, const double *c) { return 0.0; }
static double finalestrin1(double x, const double c) { return c[0]; }
static double final_estrin2(double x, const double c) { return c[0] + x  c[1]; }
static double final_estrin3(double x, const double c) { return c[0] + x  c[1] + x  x  c[2]; }
static double final_estrin4(double x, const double c) { return c[0] + x  c[1] + x  x  (c[2] + x  c[3]); }
static double finalestrin5(double x, const double *c) { return c[0] + x * c[1] + x * x * (c[2] + x * c[3] + x * x * c[4]); }
static double finalestrin6(double x, const double c) { return c[0] + x  c[1] + x  x  (c[2] + x  c[3] + x  x  c[4] + x  x  x  c[5]); }
static double ( const final_dispatch[])(double x, const double c) = {
    finalestrin0,
    finalestrin1,
    finalestrin2,
    finalestrin3,
    finalestrin4,
    finalestrin5,
    finalestrin6,
};
#endif

double improved
estrin(double x, const double c, int len) { if (len > LOOP_THRESHOLD) { int nd = len >> 1; for (int i = 0; i < nd; ++i) d[i] = c[2 i] + c[2 i + 1] x; d[nd] = c[len - 1]; len = nd + (len & 1); x = x; c = d;

while (len > LOOP_THRESHOLD) { nd = len >> 1; for (int i = 0; i < nd; ++i) d[i] = d[2
i] + d[2 i + 1] x; d[nd] = d[len - 1]; len = nd + (len & 1); x = x; } }

#if FINAL_DISPATCH return final_dispatch[len](x, c); #else switch (len) { case 0: return 0.0; case 1: return c[0]; case 2: return c[0] + x
c[1]; #if LOOPTHRESHOLD > 2 case 3: return c[0] + x * c[1] + x * x * c[2]; #endif #if LOOPTHRESHOLD > 3 case 4: return c[0] + x c[1] + x x (c[2] + x c[3]); #endif #if LOOPTHRESHOLD > 4 case 5: return c[0] + x * c[1] + x * x * (c[2] + x * c[3] + x * x * c[4]); #endif #if LOOPTHRESHOLD > 5 case 6: return c[0] + x c[1] + x x (c[2] + x c[3] + x x c[4] + x x x c[5]); #endif } return 0.0; #endif }

On my system (macbook M2 / clang) I get between 5 and 20% faster timings.

Yet you might want to use a simpler algorithm, without the recursive approach of Estrin algorithm yet still taking advantage of the instruction parallelism. I get much better timings from this simpler function:

double naive_poly4(double x, const double c, int len)
{
    double sum = 0.0;
    double term = 1.0;
    int i = 0, len4 = len & -4;
    for (; i < len4; i += 4) {
        sum += term  (c[i] + x  c[i + 1] + x  x  c[i + 2] + x  x  x  c[i + 3]);
        term = x  x  x  x;
    }
    if (len & 2) {
        sum += term  (c[i] + x  c[i + 1]);
        term = x  x;
        i += 2;
    }
    if (len & 1) {
        sum += term  c[i];
    }
    return sum;
}

To make it easier to test variants, I wrote an more elaborate benchmarking framework:

#include 
#include 
#include 
#include 
#include 

// these are the experimental parameters for testing

#define NMAX (35)

double d[1024]; // using fixed length global arrays for simplicity int repeat;

void init(double dest, int n) { double term = 1.0; dest[0] = 0; for (int i = 1; i < n; i++) // was previously 4) { nd = len / 2; for (int i = 0; i < nd; ++i) d[i] = c[2 i] + c[2 i + 1] x; if (len & 1) d[nd++] = c[len - 1];

while (nd > 4) { d[nd] = 0; // zero top term and round up x = x; nd = (nd + 1) / 2;

for (int i = 0; i < nd; ++i) d[i] = d[2
i] + d[2 i + 1] x; } x = x; } else { for (int i = 0; i < 4; i++) d[i] = c[i]; nd = len; }

switch (nd) { case 0: return 0.0; case 1: return d[0]; case 2: return d[0] + x
d[1]; case 3: return d[0] + x d[1] + x x d[2]; case 4: return d[0] + x d[1] + x x (d[2] + x d[3]); } return 0.0; }

double best_estrin(double x, const double
c, int len) { int nd; if (len > 4) { nd = len / 2; for (int i = 0; i < nd; ++i) d[i] = c[2 i] + c[2 i + 1] x; if (len & 1) d[nd++] = c[len - 1];

while (nd > 4) { len = nd; x
= x; nd = len / 2; for (int i = 0; i < nd; ++i) d[i] = d[2 i] + d[2 i + 1] x; // if (len & 1) d[nd++] = d[len - 1]; d[nd] = d[len - 1]; nd += len & 1; // eliminate branch } len = nd; x = x; } else for (int i = 0; i < 4; i++) d[i] = c[i];

switch (len) { case 0: return 0.0; case 1: return d[0]; case 2: return d[0] + x d[1]; case 3: return d[0] + x d[1] + x x d[2]; case 4: return d[0] + x d[1] + x x (d[2] + x d[3]); } return 0.0; }

#define FINALDISPATCH 1 #define LOOPTHRESHOLD 6 #if FINALDISPATCH static double finalestrin0(double x, const double c) { return 0.0; } static double final_estrin1(double x, const double c) { return c[0]; } static double finalestrin2(double x, const double *c) { return c[0] + x * c[1]; } static double finalestrin3(double x, const double c) { return c[0] + x c[1] + x x c[2]; } static double finalestrin4(double x, const double *c) { return c[0] + x * c[1] + x * x * (c[2] + x * c[3]); } static double finalestrin5(double x, const double c) { return c[0] + x c[1] + x x (c[2] + x c[3] + x x c[4]); } static double final_estrin6(double x, const double c) { return c[0] + x c[1] + x x (c[2] + x c[3] + x x c[4] + x x x c[5]); } static double ( const finaldispatch[])(double x, const double *c) = { finalestrin0, finalestrin1, finalestrin2, finalestrin3, finalestrin4, finalestrin5, finalestrin6, }; #endif

double improvedestrin(double x, const double *c, int len) { if (len > LOOPTHRESHOLD) { int nd = len >> 1; for (int i = 0; i < nd; ++i) d[i] = c[2 i] + c[2 i + 1] x; d[nd] = c[len - 1]; len = nd + (len & 1); x = x; c = d;

while (len > LOOPTHRESHOLD) { nd = len >> 1; for (int i = 0; i < nd; ++i) d[i] = d[2 * i] + d[2 * i + 1] * x; d[nd] = d[len - 1]; len = nd + (len & 1); x *= x; } }

#if FINAL
DISPATCH return finaldispatch[len](x, c); #else switch (len) { case 0: return 0.0; case 1: return c[0]; case 2: return c[0] + x * c[1]; #if LOOPTHRESHOLD > 2 case 3: return c[0] + x c[1] + x x c[2]; #endif #if LOOP_THRESHOLD > 3 case 4: return c[0] + x c[1] + x x (c[2] + x c[3]); #endif #if LOOP_THRESHOLD > 4 case 5: return c[0] + x c[1] + x x (c[2] + x c[3] + x x c[4]); #endif #if LOOP_THRESHOLD > 5 case 6: return c[0] + x c[1] + x x (c[2] + x c[3] + x x c[4] + x x x c[5]); #endif } return 0.0; #endif }

double myestrin2c(double x, const double *c, int len) { //double d[64]; while (len > 3) { int i; int len2 = len >> 1; for (i = 0; i < len2; i++) { d[i] = c[2 * i] + x * c[2 * i + 1]; } d[i] = c[2 * i]; len = (len + 1) >> 1; c = d; x = x * x; } switch (len) { case 0: return 0.0; case 1: return c[0]; case 2: return c[0] + x * c[1]; case 3: return c[0] + x * c[1] + x * x * c[2]; } return 0.0; }

double my
estrin2d(double x, const double c, int len) { //double d[64]; if (len > 4) { int nd = len >> 1; for (int i = 0; i < nd; i++) { d[i] = c[2 i] + x c[2 i + 1]; } d[nd] = c[2 nd]; len = (len + 1) >> 1; x = x x; while (len > 4) { nd = len >> 1; for (int i = 0; i < nd; i++) { d[i] = d[2 i] + x d[2 i + 1]; } d[nd] = d[2 nd]; len = (len + 1) >> 1; x = x x; } c = d; } switch (len) { case 0: return 0.0; case 1: return c[0]; case 2: return c[0] + x c[1]; case 3: return c[0] + x c[1] + x x c[2]; case 4: return c[0] + x c[1] + x x (c[2] + x c[3]); } return 0.0; }

double my_estrin2e(double x, const double
c, int len) { //double d[64]; if (len > 5) { int nd = len >> 1; for (int i = 0; i < nd; i++) { d[i] = c[2 i] + x c[2 i + 1]; } d[nd] = c[2 nd]; len = (len + 1) >> 1; x = x x; while (len > 5) { nd = len >> 1; for (int i = 0; i < nd; i++) { d[i] = d[2 i] + x d[2 i + 1]; } d[nd] = d[2 nd]; len = (len + 1) >> 1; x = x x; } c = d; } switch (len) { case 0: return 0.0; case 1: return c[0]; case 2: return c[0] + x c[1]; case 3: return c[0] + x c[1] + x x c[2]; case 4: return c[0] + x c[1] + x x (c[2] + x c[3]); case 5: return c[0] + x c[1] + x x (c[2] + x c[3] + x x c[4]); } return 0.0; }

double myestrin4c(double x, const double *c, int len) { //double d[64]; while (len > 3) { int i; int j; int len4 = len >> 2; for (i = j = 0; i < len4; i++, j += 4) { d[i] = c[j] + x * c[j + 1] + x * x * c[j + 2] + x * x * x * c[j + 3]; } switch (len & 3) { case 1: d[i] = c[j]; break; case 2: d[i] = c[j] + x * c[j + 1]; break; case 3: d[i] = c[j] + x * c[j + 1] + x * x * c[j + 2]; break; } len = (len + 3) >> 2; c = d; x = x * x * x * x; } switch (len) { case 0: return 0.0; case 1: return c[0]; case 2: return c[0] + x * c[1]; case 3: return c[0] + x * c[1] + x * x * c[2]; } return 0.0; }

struct { const char *name; double(*fun)(double, const double*, int len); int skip; #define FUNC(n,f) { n, f, 0 } } funcs[] = { FUNC("poly", naive
poly), FUNC("poly2", naivepoly2), FUNC("poly4", naivepoly4), FUNC("poly4s", naivepoly4s), FUNC("estrin2c", myestrin2c), FUNC("estrin2d", myestrin2d), FUNC("estrin2e", myestrin2e), FUNC("estrin4c", myestrin4c), FUNC("newest", newestestrin), FUNC("best", bestestrin), FUNC("improved", improvedestrin), };

double timefun(double(myfun)(double, const double, int len), const double p, int len, double sum) { double sum0, sum1; float x, dx, ox; timet tstart, tend; sum0 = sum1 = 0.0; dx = 0x1.0p-54; if (repeat) dx = 1.0f / repeat; tstart = clock(); x = 0.0f; ox = -1.0f; //long count = 0; while (x = x) dx += dx; ox = x; x += dx; sum1 += (*myfun)((double)x, p, len); x += dx; //count += 2; } tend = clock(); if (sum) *sum = sum0 + sum1; double elapsed = (double)(tend - tstart) / CLOCKSPERSEC; if (repeat) elapsed = elapsed * 268435520 / (repeat + 2); //printf("count=%ld\n", count); return elapsed; }

void testfun(const char* name, double(*myfun)(double, const double*, int len), double(*reffun)(double), const double *p, int len) { double x, dx, ox, err, ref, res, xmax, xmin, minerr, maxerr, sum; static bool doheader = true;

if (doheader) { printf("%-20s %6s %12s @ %-11s %12s @ %-11s %s\n\n", "function name", "length", "max
err", "xmax", "minerr", "xmin", "time"); doheader = false; } xmin = xmax = minerr = maxerr = x = 0.0; ox = -1.0; dx = 0x1.0p-16; { x = 0.0; while (x maxerr) { maxerr = err; xmax = x; } if (err < minerr) { minerr = err; xmin = x; } if (ox >= x) dx += dx; ox = x; x += dx; } } printf("%-20s %6d %12g @ %-11g %12g @ %-11g %g\n", name, len, maxerr, xmax, minerr, xmin, timefun(myfun, p, len, &sum)); if (isinf(sum)) printf(" timing sum infinite!\n"); if (isnan(sum)) printf(" timing sum is Nan\n"); }

int compute
chartdata(const double *plog, int nmin, int nmax) { printf("%10s", "N"); for (sizet t = 0; t < sizeof(funcs) / sizeof(funcs[0]); t++) { if (!funcs[t].skip) printf(" %20s", funcs[t].name); } printf("\n"); for (int n = nmin; n
August 5, 2025 Score: 1 Rep: 158,946 Quality: Low Completeness: 50%

Suggestions for ways to make the algorithm faster are also welcome.

Get functionality correct first.

init(double* plog, int n) assigned plog[0] to plog[n] (n + 1 array elements), yet function is called with double plog[2048] init(plog, 2048);.

Reduce type changes

Minor: printf(" %g", (float)(tend - tstart) / CLOCKSPERSEC); does a timet to float conversion, then likely a float division, that a conversion to double before calling printf(). Consider printf(" %g", (double)(tend - tstart) / CLOCKSPER_SEC); and skip the narrowing and widening of floating point types.