stackoverflow August 5, 2025 Rep: 3,875

Estrin polynomial evaluation - Intel ICX strange slowdown for lengths 16N-1

Score

Answers

127

Views

17.7

Trend Score

Question Details

No question body available.

Answers (2)

August 6, 2025 Score: 2 Rep: 152,998 Quality: Medium Completeness: 100%

Here are some suggestions:

try not using global variables. Loading global addresses may prove inefficient in terms of generated code.
use sizet instead of int for the index and length variables: nd = (nd + 1) / 2; does not compile to a simple shift if the compiler cannot ascertain that nd is non negative.

the switch may be a source of inefficiencies. Since you can assume access to d[0] through d[7], you could turn the switch into a single branchless expression:

static const double f[] = { 0, 0, 0, 0, 1, 1, 1, 1 }; d[nd] = d[nd + 1] = d[nd + 2] = d[nd + 3] = 0; return d[0] * f[nd+3] + x * d[1] * f[nd+2] + x * x * (d[2] * f[nd+1] + x * d[3] * f[nd+0]);

try and separate logging timings and printing results: calling printf in the timing loop may flush some cache lines with adverse effects depending on the library implementation and the buffer size.

Since none of the above help, here is a modified version with easy customisation:

#define FINALDISPATCH 1
#define LOOPTHRESHOLD 6
#if FINALDISPATCH
static double finalestrin0(double x, const double *c) { return 0.0; }
static double finalestrin1(double x, const double c) { return c[0]; }
static double final_estrin2(double x, const double c) { return c[0] + x  c[1]; }
static double final_estrin3(double x, const double c) { return c[0] + x  c[1] + x  x  c[2]; }
static double final_estrin4(double x, const double c) { return c[0] + x  c[1] + x  x  (c[2] + x  c[3]); }
static double finalestrin5(double x, const double *c) { return c[0] + x * c[1] + x * x * (c[2] + x * c[3] + x * x * c[4]); }
static double finalestrin6(double x, const double c) { return c[0] + x  c[1] + x  x  (c[2] + x  c[3] + x  x  c[4] + x  x  x  c[5]); }
static double ( const final_dispatch[])(double x, const double c) = {
    finalestrin0,
    finalestrin1,
    finalestrin2,
    finalestrin3,
    finalestrin4,
    finalestrin5,
    finalestrin6,
};
#endif
double improvedestrin(double x, const double c, int len)
{
    if (len > LOOP_THRESHOLD)
    {
        int nd = len >> 1;
        for (int i = 0; i < nd; ++i)
            d[i] = c[2  i] + c[2  i + 1]  x;
        d[nd] = c[len - 1];
        len = nd + (len & 1);
        x = x;
        c = d;
        while (len > LOOP_THRESHOLD)
        {
            nd = len >> 1;
            for (int i = 0; i < nd; ++i)
                d[i] = d[2  i] + d[2  i + 1]  x;
            d[nd] = d[len - 1];
            len = nd + (len & 1);
            x = x;
        }
    }
#if FINAL_DISPATCH
    return final_dispatch[len](x, c);
#else
    switch (len) {
    case 0: return 0.0;
    case 1: return c[0];
    case 2: return c[0] + x  c[1];
#if LOOPTHRESHOLD > 2
    case 3: return c[0] + x * c[1] + x * x * c[2];
#endif
#if LOOPTHRESHOLD > 3
    case 4: return c[0] + x  c[1] + x  x  (c[2] + x  c[3]);
#endif
#if LOOPTHRESHOLD > 4
    case 5: return c[0] + x * c[1] + x * x * (c[2] + x * c[3] + x * x * c[4]);
#endif
#if LOOPTHRESHOLD > 5
    case 6: return c[0] + x  c[1] + x  x  (c[2] + x  c[3] + x  x  c[4] + x  x  x  c[5]);
#endif
    }
    return 0.0;
#endif
}

On my system (macbook M2 / clang) I get between 5 and 20% faster timings.

Yet you might want to use a simpler algorithm, without the recursive approach of Estrin algorithm yet still taking advantage of the instruction parallelism. I get much better timings from this simpler function:

double naive_poly4(double x, const double c, int len)
{
    double sum = 0.0;
    double term = 1.0;
    int i = 0, len4 = len & -4;
    for (; i < len4; i += 4) {
        sum += term  (c[i] + x  c[i + 1] + x  x  c[i + 2] + x  x  x  c[i + 3]);
        term = x  x  x  x;
    }
    if (len & 2) {
        sum += term  (c[i] + x  c[i + 1]);
        term = x  x;
        i += 2;
    }
    if (len & 1) {
        sum += term  c[i];
    }
    return sum;
}

To make it easier to test variants, I wrote an more elaborate benchmarking framework:

#include 
#include 
#include 
#include 
#include 
// these are the experimental parameters for testing
#define NMAX (35)
double d[1024];  // using fixed length global arrays for simplicity
int repeat;
void init(double dest, int n)
{
    double term = 1.0;
    dest[0] = 0;
    for (int i = 1; i < n; i++) // was previously  4)
    {
        nd = len / 2;
        for (int i = 0; i < nd; ++i)
            d[i] = c[2  i] + c[2  i + 1]  x;
        if (len & 1) d[nd++] = c[len - 1];
        while (nd > 4)
        {
            d[nd] = 0;  // zero top term and round up
            x = x;
            nd = (nd + 1) / 2;
            for (int i = 0; i < nd; ++i)
                d[i] = d[2  i] + d[2  i + 1]  x;
        }
        x = x;
    }
    else
    {
        for (int i = 0; i < 4; i++) d[i] = c[i];
        nd = len;
    }
    switch (nd) {
    case 0: return 0.0;
    case 1: return d[0];
    case 2: return d[0] + x  d[1];
    case 3: return d[0] + x  d[1] + x  x  d[2];
    case 4: return d[0] + x  d[1] + x  x  (d[2] + x  d[3]);
    }
    return 0.0;
}
double best_estrin(double x, const double c, int len)
{
    int nd;
    if (len > 4)
    {
        nd = len / 2;
        for (int i = 0; i < nd; ++i)
            d[i] = c[2  i] + c[2  i + 1]  x;
        if (len & 1) d[nd++] = c[len - 1];
        while (nd > 4)
        {
            len = nd;
            x = x;
            nd = len / 2;
            for (int i = 0; i < nd; ++i)
                d[i] = d[2  i] + d[2  i + 1]  x;
            //           if (len & 1) d[nd++] = d[len - 1];
            d[nd] = d[len - 1];
            nd += len & 1; // eliminate branch
        }
        len = nd;
        x = x;
    }
    else
        for (int i = 0; i < 4; i++) d[i] = c[i];
    switch (len) {
    case 0: return 0.0;
    case 1: return d[0];
    case 2: return d[0] + x  d[1];
    case 3: return d[0] + x  d[1] + x  x  d[2];
    case 4: return d[0] + x  d[1] + x  x  (d[2] + x  d[3]);
    }
    return 0.0;
}
#define FINALDISPATCH 1
#define LOOPTHRESHOLD 6
#if FINALDISPATCH
static double finalestrin0(double x, const double c) { return 0.0; }
static double final_estrin1(double x, const double c) { return c[0]; }
static double finalestrin2(double x, const double *c) { return c[0] + x * c[1]; }
static double finalestrin3(double x, const double c) { return c[0] + x  c[1] + x  x  c[2]; }
static double finalestrin4(double x, const double *c) { return c[0] + x * c[1] + x * x * (c[2] + x * c[3]); }
static double finalestrin5(double x, const double c) { return c[0] + x  c[1] + x  x  (c[2] + x  c[3] + x  x  c[4]); }
static double final_estrin6(double x, const double c) { return c[0] + x  c[1] + x  x  (c[2] + x  c[3] + x  x  c[4] + x  x  x  c[5]); }
static double ( const finaldispatch[])(double x, const double *c) = {
    finalestrin0,
    finalestrin1,
    finalestrin2,
    finalestrin3,
    finalestrin4,
    finalestrin5,
    finalestrin6,
};
#endif
double improvedestrin(double x, const double *c, int len)
{
    if (len > LOOPTHRESHOLD)
    {
        int nd = len >> 1;
        for (int i = 0; i < nd; ++i)
            d[i] = c[2  i] + c[2  i + 1]  x;
        d[nd] = c[len - 1];
        len = nd + (len & 1);
        x = x;
        c = d;
        while (len > LOOPTHRESHOLD)
        {
            nd = len >> 1;
            for (int i = 0; i < nd; ++i)
                d[i] = d[2 * i] + d[2 * i + 1] * x;
            d[nd] = d[len - 1];
            len = nd + (len & 1);
            x *= x;
        }
    }
#if FINALDISPATCH
    return finaldispatch[len](x, c);
#else
    switch (len) {
    case 0: return 0.0;
    case 1: return c[0];
    case 2: return c[0] + x * c[1];
#if LOOPTHRESHOLD > 2
    case 3: return c[0] + x  c[1] + x  x  c[2];
#endif
#if LOOP_THRESHOLD > 3
    case 4: return c[0] + x  c[1] + x  x  (c[2] + x  c[3]);
#endif
#if LOOP_THRESHOLD > 4
    case 5: return c[0] + x  c[1] + x  x  (c[2] + x  c[3] + x  x  c[4]);
#endif
#if LOOP_THRESHOLD > 5
    case 6: return c[0] + x  c[1] + x  x  (c[2] + x  c[3] + x  x  c[4] + x  x  x  c[5]);
#endif
    }
    return 0.0;
#endif
}
double myestrin2c(double x, const double *c, int len)
{
    //double d[64];
    while (len > 3) {
        int i;
        int len2 = len >> 1;
        for (i = 0; i < len2; i++) {
            d[i] = c[2 * i] + x * c[2 * i + 1];
        }
        d[i] = c[2 * i];
        len = (len + 1) >> 1;
        c = d;
        x = x * x;
    }
    switch (len) {
    case 0: return 0.0;
    case 1: return c[0];
    case 2: return c[0] + x * c[1];
    case 3: return c[0] + x * c[1] + x * x * c[2];
    }
    return 0.0;
}
double myestrin2d(double x, const double c, int len)
{
    //double d[64];
    if (len > 4) {
        int nd = len >> 1;
        for (int i = 0; i < nd; i++) {
            d[i] = c[2  i] + x  c[2  i + 1];
        }
        d[nd] = c[2  nd];
        len = (len + 1) >> 1;
        x = x  x;
        while (len > 4) {
            nd = len >> 1;
            for (int i = 0; i < nd; i++) {
                d[i] = d[2  i] + x  d[2  i + 1];
            }
            d[nd] = d[2  nd];
            len = (len + 1) >> 1;
            x = x  x;
        }
        c = d;
    }
    switch (len) {
    case 0: return 0.0;
    case 1: return c[0];
    case 2: return c[0] + x  c[1];
    case 3: return c[0] + x  c[1] + x  x  c[2];
    case 4: return c[0] + x  c[1] + x  x  (c[2] + x  c[3]);
    }
    return 0.0;
}
double my_estrin2e(double x, const double c, int len)
{
    //double d[64];
    if (len > 5) {
        int nd = len >> 1;
        for (int i = 0; i < nd; i++) {
            d[i] = c[2  i] + x  c[2  i + 1];
        }
        d[nd] = c[2  nd];
        len = (len + 1) >> 1;
        x = x  x;
        while (len > 5) {
            nd = len >> 1;
            for (int i = 0; i < nd; i++) {
                d[i] = d[2  i] + x  d[2  i + 1];
            }
            d[nd] = d[2  nd];
            len = (len + 1) >> 1;
            x = x  x;
        }
        c = d;
    }
    switch (len) {
    case 0: return 0.0;
    case 1: return c[0];
    case 2: return c[0] + x  c[1];
    case 3: return c[0] + x  c[1] + x  x  c[2];
    case 4: return c[0] + x  c[1] + x  x  (c[2] + x  c[3]);
    case 5: return c[0] + x  c[1] + x  x  (c[2] + x  c[3] + x  x  c[4]);
    }
    return 0.0;
}
double myestrin4c(double x, const double *c, int len)
{
    //double d[64];
    while (len > 3) {
        int i;
        int j;
        int len4 = len >> 2;
        for (i = j = 0; i < len4; i++, j += 4) {
            d[i] = c[j] + x * c[j + 1] + x * x * c[j + 2] + x * x * x * c[j + 3];
        }
        switch (len & 3) {
        case 1: d[i] = c[j]; break;
        case 2: d[i] = c[j] + x * c[j + 1]; break;
        case 3: d[i] = c[j] + x * c[j + 1] + x * x * c[j + 2]; break;
        }
        len = (len + 3) >> 2;
        c = d;
        x = x * x * x * x;
    }
    switch (len) {
    case 0: return 0.0;
    case 1: return c[0];
    case 2: return c[0] + x * c[1];
    case 3: return c[0] + x * c[1] + x * x * c[2];
    }
    return 0.0;
}
struct {
    const char *name;
    double(*fun)(double, const double*, int len);
    int skip;
#define FUNC(n,f)  { n, f, 0 }
} funcs[] = {
    FUNC("poly", naivepoly),
    FUNC("poly2", naivepoly2),
    FUNC("poly4", naivepoly4),
    FUNC("poly4s", naivepoly4s),
    FUNC("estrin2c", myestrin2c),
    FUNC("estrin2d", myestrin2d),
    FUNC("estrin2e", myestrin2e),
    FUNC("estrin4c", myestrin4c),
    FUNC("newest", newestestrin),
    FUNC("best", bestestrin),
    FUNC("improved", improvedestrin),
};
double timefun(double(myfun)(double, const double, int len), const double p, int len, double sum)
{
    double sum0, sum1;
    float x, dx, ox;
    timet tstart, tend;
    sum0 = sum1 = 0.0;
    dx = 0x1.0p-54;
    if (repeat) dx = 1.0f / repeat;
    tstart = clock();
    x = 0.0f;
    ox = -1.0f;
    //long count = 0;
    while (x = x) dx += dx;
        ox = x;
        x += dx;
        sum1 += (*myfun)((double)x, p, len);
        x += dx;
        //count += 2;
    }
    tend = clock();
    if (sum) *sum = sum0 + sum1;
    double elapsed = (double)(tend - tstart) / CLOCKSPERSEC;
    if (repeat) elapsed = elapsed * 268435520 / (repeat + 2);
    //printf("count=%ld\n", count);
    return elapsed;
}
void testfun(const char* name, double(*myfun)(double, const double*, int len), double(*reffun)(double), const double *p, int len)
{
    double x, dx, ox, err, ref, res, xmax, xmin, minerr, maxerr, sum;
    static bool doheader = true;
    if (doheader) {
        printf("%-20s %6s  %12s @ %-11s  %12s @ %-11s  %s\n\n",
               "function name", "length", "maxerr", "xmax", "minerr", "xmin", "time");
        doheader = false;
    }
    xmin = xmax = minerr = maxerr = x = 0.0;
    ox = -1.0;
    dx = 0x1.0p-16;
    {
        x = 0.0;
        while (x  maxerr) {
                maxerr = err;
                xmax = x;
            }
            if (err < minerr) {
                minerr = err;
                xmin = x;
            }
            if (ox >= x) dx += dx;
            ox = x;
            x += dx;
        }
    }
    printf("%-20s %6d  %12g @ %-11g  %12g @ %-11g  %g\n",
           name, len, maxerr, xmax, minerr, xmin, timefun(myfun, p, len, &sum));
    if (isinf(sum)) printf(" timing sum infinite!\n");
    if (isnan(sum)) printf(" timing sum is Nan\n");
}
int computechartdata(const double *plog, int nmin, int nmax) {
    printf("%10s", "N");
    for (sizet t = 0; t < sizeof(funcs) / sizeof(funcs[0]); t++) {
        if (!funcs[t].skip)
            printf(" %20s", funcs[t].name);
    }
    printf("\n");
    for (int n = nmin; n

August 5, 2025 Score: 1 Rep: 158,946 Quality: Low Completeness: 50%

Suggestions for ways to make the algorithm faster are also welcome.

Get functionality correct first.

init(double* plog, int n) assigned plog[0] to plog[n] (n + 1 array elements), yet function is called with double plog[2048] init(plog, 2048);.

Reduce type changes

Minor: printf(" %g", (float)(tend - tstart) / CLOCKSPERSEC); does a timet to float conversion, then likely a float division, that a conversion to double before calling printf(). Consider printf(" %g", (double)(tend - tstart) / CLOCKSPER_SEC); and skip the narrowing and widening of floating point types.

Export Question Data

Export this question and its answers for further analysis or reporting.

Back to Questions

Estrin polynomial evaluation - Intel ICX strange slowdown for lengths 16N-1

Question Details

Tags

Answers (2)

Analysis Metrics

Question Information

Actions

Related Questions

Export Question Data