msparser/help/group___shapiro_wilk_source_code.html

/*

##############################################################################

# File: ms_shapiro_wilk.cpp                                                  #

# Mascot Parser toolkit                                                      #

# Source file for Shapiro-Wilk W test                                        #

##############################################################################

#    $Source$

#    $Author$

#      $Date$

#  $Revision$

##############################################################################

*/

/*

##############################################################################

# Applied Statistics algorithms                                              #

#                                                                            #

# Translated and adapted from routines that originally appeared in the       #

# Journal of the Royal Statistical Society Series C (Applied Statistics)     #

# and made available through StatLib http://lib.stat.cmu.edu/                #

#                                                                            #

# The implementation and source code for this algorithm is available as a    #

# free download from http://www.matrixscience.com/msparser.html              #

# to conform with the requirement that no fee is charged for their use       #

##############################################################################

*/


// STDAFX.H includes windows.h, msparser.hpp and other commonly used headers

#include "stdafx.h"


#include <cmath>

#include <limits>

#include <algorithm>


using namespace matrix_science;


ms_shapiro_wilk::ms_shapiro_wilk()

    :w_(0)

    ,pw_(0)

    ,ifault_(0)

{

}


ms_shapiro_wilk::ms_shapiro_wilk(std::deque<std::pair<size_t,double> > x,

                                 long n,

                                 long n1,

                                 long n2)

    :w_(0)

    ,pw_(0)

    ,ifault_(0)

{

    swilk(false, x, n, n1, n2, a_, w_, pw_, ifault_);

}


ms_shapiro_wilk::~ms_shapiro_wilk()

{

}


double ms_shapiro_wilk::getResult() const

{

    return w_;

}


double ms_shapiro_wilk::getPValue() const

{

    return pw_;

}


double ms_shapiro_wilk::getErrorCode() const

{

    return ifault_;

}


void ms_shapiro_wilk::appendSampleValue(double x) {

    std::deque<std::pair<size_t,double> >::size_type idx = x_.size();

    x_.push_back(std::make_pair(idx, x));

    w_      = 0;

    pw_     = 0;

    ifault_ = 0;

}


void ms_shapiro_wilk::clearSampleValues() {

    x_.clear();

    w_      = 0;

    pw_     = 0;

    ifault_ = 0;

}


void ms_shapiro_wilk::calculate(long n, long n1, long n2) {

    swilk(false, x_, n, n1, n2, a_, w_, pw_, ifault_);

}


void ms_shapiro_wilk::swilk(bool init,

                            std::deque<std::pair<size_t,double> > x,

                            long n,

                            long n1,

                            long n2,

                            std::deque<double> &a,

                            double &w,

                            double &pw,

                            int &ifault) {

    /* Algorithm AS R94, Journal of the Royal Statistical Society Series C

     * (Applied Statistics) vol. 44, no. 4, pp. 547-551 (1995).

     */


    w = 0; // stop the behaviour change 'feature'

    pw = 0;

    ifault = 0;


    /* Initialized data */

    static const double zero = 0.0;

    static const double one = 1.0;

    static const double two = 2.0;

    static const double three = 3.0;


    static const double z90 = 1.2816;

    static const double z95 = 1.6449;

    static const double z99 = 2.3263;

    static const double zm = 1.7509;

    static const double zss = 0.56268;

    static const double bf1 = 0.8378;

    static const double xx90 = 0.556;

    static const double xx95 = 0.622;

    static const double sqrth = 0.70711;/* sqrt(1/2) = .7071068 */

    static const double small_value = 1e-19;

    static const double pi6 = 1.909859;

    static const double stqr = 1.047198;


    /* polynomial coefficients */

    static const double g[2] = { -2.273,.459 };

    static const double c1[6] = { 0.,.221157,-.147981,-2.07119, 4.434685, -2.706056 },

                     c2[6] = { 0.,.042981,-.293762,-1.752461,5.682633, -3.582633 };

    static const double c3[4] = { .544,-.39978,.025054,-6.714e-4 };

    static const double c4[4] = { 1.3822,-.77857,.062767,-.0020322 };

    static const double c5[4] = { -1.5861,-.31082,-.083751,.0038915 };

    static const double c6[3] = { -.4803,-.082676,.0030302 };

    static const double c7[2] = { .164,.533 };

    static const double c8[2] = { .1736,.315 };

    static const double c9[2] = { .256,-.00635 };


    /* System generated locals */

    double r__1;


    a.resize(n, 0.0);


    /*

     * Auxiliary routines : poly()  {below}

     */


    /* Local variables */

    int i, j, ncens, i1, nn2;


    double zbar, ssassx, summ2, ssumm2, gamma, delta, range;

    double a1, a2, an, bf, ld, m, s, sa, xi, sx, xx, y, w1;

    double fac, asa, an25, ssa, z90f, sax, zfm, z95f, zsd, z99f, rsn, ssx, xsx;


    /* Parameter adjustments */

    pw = 1.0;

    ifault = 0;

    if (w >= 0.0) {

        w = 1.0;

    }

    an = (double) (n);

    nn2 = n / 2;

    if (n2 < nn2) {

        ifault = 3;

        return;

    }

    if (n < 3) {

        ifault = 1;

        return;

    }


    /*  If INIT is false, calculate coefficients a[] for the test */

    if (! (init)) {

        if (n==3) {

                a[0] = sqrth;

        } else {

            an25 = an + .25;

            summ2 = zero;

            for (i = 1; i <= n2; ++i) {

                a[i-1] = (double) ppnd7((i - .375f) / an25, ifault);

                if (ifault) {

                    ifault = 8;

                    return;

                }

                summ2 += (a[i-1] * a[i-1]);

            }

            summ2 *= two;

            ssumm2 = std::sqrt(summ2);

            rsn = one / std::sqrt(an);

            a1 = poly(c1, 6, rsn) - a[0] / ssumm2;


            /* Normalize a[] */

            if (n > 5) {

                i1 = 3;

                a2 = -a[1] / ssumm2 + poly(c2, 6, rsn);

                fac = std::sqrt((summ2 - two * a[0] * a[0] - two * a[1] * a[1])

                            / (one - two * a1 * a1 - two * a2 * a2));

                a[1] = a2;

            } else {

                i1 = 2;

                fac = std::sqrt((summ2 - two * a[0] * a[0]) / ( one  - two * a1 * a1));

            }

            a[0] = a1;

            for (i = i1; i <= nn2; ++i) a[i-1] /= - fac;

        }

        init = true;

    }

    if (n1 < 3) {

        ifault = 1;

        return;

    }

    ncens = n - n1;

    if (ncens < 0 || (ncens > 0 && n < 20)) {

        ifault = 4;

        return;

    }

    delta = (double) ncens / an;

    if (delta > 0.8) {

        ifault = 5;

        return;

    }


    /*  If W input as negative, calculate significance level of -W */

    if (w < zero) {

        w1 = 1. + w;

        ifault = 0;

        goto L70;

    }


    /*  Check for zero range */

    range = x[n1 - 1].second - x[0].second;

    if (range < small_value) {

        ifault = 6;

        return;

    }


    /*  Check for correct sort order on range - scaled X */


    /* *ifault = 7; <-- a no-op, since it is set 0, below, in ANY CASE! */

    ifault = 0;

    xx = x[0].second / range;

    sx = xx;

    sa = -a[0];

    j = n - 1;

    for (i = 2; i <= n1; ++i) {

        xi = x[i-1].second / range;

        if (xx - xi > small_value) {

            /* Fortran had:  print *, "ANYTHING"

            * but do NOT; it *does* happen with sorted x (on Intel GNU/linux):

            *  shapiro.test(c(-1.7, -1,-1,-.73,-.61,-.5,-.24, .45,.62,.81,1))

            */

            ifault = 7;

            return;

        }

        sx += xi;

        if (i != j) sa += sign(1,i - j) * a[std::min(i,j)-1];

        xx = xi;

        --j;

    }

    if (n > 5000) {

        ifault = 2;

    }


/*  Calculate W statistic as squared correlation

    between data and coefficients */


    sa /= n1;

    sx /= n1;

    ssa = ssx = sax = zero;

    j = n;

    for (i = 1; i <= n1; ++i, --j) {

        if (i != j)

            asa = sign(1,i - j) * a[std::min(i,j)-1] - sa;

        else

            asa = -sa;

        xsx = x[i-1].second / range - sx;

        ssa += asa * asa;

        ssx += xsx * xsx;

        sax += asa * xsx;

    }


/*  W1 equals (1-W) claculated to avoid excessive rounding error

    for W very near 1 (a potential problem in very large samples) */


    ssassx = std::sqrt(ssa * ssx);

    w1 = (ssassx - sax) * (ssassx + sax) / (ssa * ssx);

L70:

    w = 1. - w1;


/*  Calculate significance level for W */


    if (n == 3) {/* exact P value : */

        pw = pi6 * (std::asin(std::sqrt(w)) - stqr);

        return;

    }

    y = std::log(w1);

    xx = std::log(an);

    m = zero;

    s = one;

    if (n <= 11) {

        gamma = poly(g, 2, an);

        if (y >= gamma) {

            pw = small_value;/* FIXME: rather use an even small_valueer value, or NA ? */

            return;

        }

        y = -std::log(gamma - y);

        m = poly(c3, 4, an);

        s = std::exp(poly(c4, 4, an));

    } else {/* n >= 12 */

        m = poly(c5, 4, xx);

        s = std::exp(poly(c6, 3, xx));

    }

    /*DBG printf("c(w1=%g, w=%g, y=%g, m=%g, s=%g)\n",w1,*w,y,m,s); */


    if (ncens > 0) {/* <==>  n > n1 */

    /*  Censoring by proportion NCENS/N.

        Calculate mean and sd of normal equivalent deviate of W. */


        ld = -std::log(delta);

        bf = one + xx * bf1;

        r__1 = pow(xx90, (double) xx);

        z90f = z90 + bf * std::pow(poly(c7, 2, r__1), (double) ld);

        r__1 = pow(xx95, (double) xx);

        z95f = z95 + bf * std::pow(poly(c8, 2, r__1), (double) ld);

        z99f = z99 + bf * std::pow(poly(c9, 2, xx), (double)ld);


        /* Regress Z90F,...,Z99F on normal deviates Z90,...,Z99 to get

         * pseudo-mean and pseudo-sd of z as the slope and intercept

         */


        zfm = (z90f + z95f + z99f) / three;

        zsd = (z90 * (z90f - zfm) +

                    z95 * (z95f - zfm) + z99 * (z99f - zfm)) / zss;

        zbar = zfm - zsd * zm;

        m += zbar * s;

        s *= zsd;

    }

    pw = alnorm((y-m)/s, true);

    /*  = alnorm_(dble((Y - M)/S), 1); */


    // Results are returned in w, pw and ifault

    return;

} /* swilk */


double ms_shapiro_wilk::poly(const double *cc, int nord, double x)

{

    /* Auxiliary procedure in algorithm AS 181, Journal of the Royal

     * Statistical Society Series C (Applied Statistics) vol. 31, no. 2,

     * pp. 176-180 (1982).

     */


    /* Local variables */


    int n2,j;

    double ret_val = cc[0];

    if (nord == 1) return ret_val;

    double p = x * cc[nord-1];

    if (nord != 2) {

        n2 = nord - 2;

        j = n2 + 1;

        for (int i=1;i<=n2;i++) {

            p = (p + cc[j-1])*x;

            j--;

        }

    }

    ret_val = ret_val + p;

    return ret_val;

} /* poly */


double ms_shapiro_wilk::ppnd7(double p, int &ifault) {

    /* Algorithm AS 241, Journal of the Royal Statistical Society Series C

     * (Applied Statistics) vol. 26, no. 3, pp. 118-121 (1977).

     */


    static const double zero = 0.0;

    static const double one = 1.0;

    static const double half = 0.5;

    static const double split1 = 0.425;

    static const double split2 = 5.0;

    static const double const1 = 0.180625;

    static const double const2 = 1.6;

    static const double a0 = 3.3871327179E+00;

    static const double a1 = 5.0434271938E+01;

    static const double a2 = 1.5929113202E+02;

    static const double a3 = 5.9109374720E+01;

    static const double b1 = 1.7895169469E+01;

    static const double b2 = 7.8757757664E+01;

    static const double b3 = 6.7187563600E+01;

    static const double c0 = 1.4234372777E+00;

    static const double c1 = 2.7568153900E+00;

    static const double c2 = 1.3067284816E+00;

    static const double c3 = 1.7023821103E-01;

    static const double d1 = 7.3700164250E-01;

    static const double d2 = 1.2021132975E-01;

    static const double e0 = 6.6579051150E+00;

    static const double e1 = 3.0812263860E+00;

    static const double e2 = 4.2868294337E-01;

    static const double e3 = 1.7337203997E-02;

    static const double f1 = 2.4197894225E-01;

    static const double f2 = 1.2258202635E-02;


    double normal_dev;

    double q;

    double r;


    ifault = 0;

    q = p - half;

    if (std::abs(q) <= split1) {

        r = const1 - q * q;

        normal_dev = q * (((a3 * r + a2) * r + a1) * r + a0) /

                     (((b3 * r + b2) * r + b1) * r + one);

        return normal_dev;

    } else {

        if (q < zero) {

            r = p;

        } else {

            r = one - p;

        }

        if (r <= zero) {

            ifault = 1;

            normal_dev = zero;

            return normal_dev;

        }

        r = std::sqrt(-std::log(r));

        if (r <= split2) {

            r = r - const2;

            normal_dev = (((c3 * r + c2) * r + c1) * r + c0) / ((d2 * r + d1) * r + one);

        } else {

            r = r - split2;

            normal_dev = (((e3 * r + e2) * r + e1) * r + e0) / ((f2 * r + f1) * r + one);

        }

        if (q < zero) { normal_dev = - normal_dev; }

        return normal_dev;

    }

}


double ms_shapiro_wilk::alnorm(double x,bool upper) {

    /* Algorithm AS 66, Journal of the Royal Statistical Society Series C

     * (Applied Statistics) vol. 22, pp. 424-427 (1973).

     */


    static const double zero = 0;

    static const double one = 1;

    static const double half = 0.5;

    static const double con = 1.28;

    static const double ltone = 7.0;

    static const double utzero = 18.66;

    static const double p = 0.398942280444;

    static const double q = 0.39990348504;

    static const double r = 0.398942280385;

    static const double a1 = 5.75885480458;

    static const double a2 = 2.62433121679;

    static const double a3 = 5.92885724438;

    static const double b1 = -29.8213557807;

    static const double b2 = 48.6959930692;

    static const double c1 = -3.8052E-8;

    static const double c2 = 3.98064794E-4;

    static const double c3 = -0.151679116635;

    static const double c4 = 4.8385912808;

    static const double c5 = 0.742380924027;

    static const double c6 = 3.99019417011;

    static const double d1 = 1.00000615302;

    static const double d2 = 1.98615381364;

    static const double d3 = 5.29330324926;

    static const double d4 = -15.1508972451;

    static const double d5 = 30.789933034;


    double alnorm;

    double z;

    double y;

    bool up = upper;

    z = x;

    if (z < zero){

        up = !up;

        z = -z;

    }

    if (z <= ltone || (up && z <= utzero)) {

        y = half * z * z;

        if (z > con) {

            alnorm = r * std::exp(-y) / (z + c1 + d1 / (z + c2 + d2 / (z + c3 + d3

                    / (z + c4 + d4 / (z + c5 + d5 / (z + c6))))));

        } else {

            alnorm = half - z * (p - q * y/(y + a1 + b1 /(y + a2 + b2/(y + a3))));

        }

    } else {

        alnorm = zero;

    }


    if(!up) { alnorm = one - alnorm; }

    return alnorm;

}


long ms_shapiro_wilk::sign(long x,long y) {

    if (y<0)

        return -std::labs(x);

    else

        return std::labs(x);

}