Matrix Science header

Source code for the Shapiro-Wilk W test algorithm
[Quantitation analysis for Mascot Server and Distiller]

/*
##############################################################################
# File: ms_shapiro_wilk.cpp                                                  #
# Mascot Parser toolkit                                                      #
# Source file for Shapiro-Wilk W test                                        #
##############################################################################
#    $Source: /vol/cvsroot/parser/src/tools/ms_shapiro_wilk.cpp,v $
#    $Author: francoisr $ 
#      $Date: 2016/05/17 12:49:20 $ 
#  $Revision: 1.10 $
##############################################################################
*/
/*
##############################################################################
# Applied Statistics algorithms                                              #
#                                                                            #
# Translated and adapted from routines that originally appeared in the       #
# Journal of the Royal Statistical Society Series C (Applied Statistics)     #
# and made available through StatLib http://lib.stat.cmu.edu/                #
#                                                                            #
# The implementation and source code for this algorithm is available as a    #
# free download from http://www.matrixscience.com/msparser.html              #
# to conform with the requirement that no fee is charged for their use       #
##############################################################################
*/


// STDAFX.H includes windows.h, msparser.hpp and other commonly used headers
#include "stdafx.h"

#include <cmath>
#include <limits>
#include <algorithm>


using namespace matrix_science;


ms_shapiro_wilk::ms_shapiro_wilk()
    :w_(0)
    ,pw_(0)
    ,ifault_(0) 
{
}

ms_shapiro_wilk::ms_shapiro_wilk(std::deque<std::pair<size_t,double> > x,
                                 long n,
                                 long n1,
                                 long n2)
    :w_(0)
    ,pw_(0)
    ,ifault_(0) 
{
    swilk(false, x, n, n1, n2, a_, w_, pw_, ifault_);
}

ms_shapiro_wilk::~ms_shapiro_wilk()
{
}


double ms_shapiro_wilk::getResult() const
{
    return w_;
}


double ms_shapiro_wilk::getPValue() const
{
    return pw_;
}


double ms_shapiro_wilk::getErrorCode() const
{
    return ifault_;
}


void ms_shapiro_wilk::appendSampleValue(double x) {
    std::deque<std::pair<size_t,double> >::size_type idx = x_.size();
    x_.push_back(std::make_pair(idx, x));
    w_      = 0;
    pw_     = 0;
    ifault_ = 0;
}

void ms_shapiro_wilk::clearSampleValues() {
    x_.clear();
    w_      = 0;
    pw_     = 0;
    ifault_ = 0;
}

void ms_shapiro_wilk::calculate(long n, long n1, long n2) {
    swilk(false, x_, n, n1, n2, a_, w_, pw_, ifault_);
}

void ms_shapiro_wilk::swilk(bool init,
                            std::deque<std::pair<size_t,double> > x,
                            long n,
                            long n1,
                            long n2,
                            std::deque<double> &a,
                            double &w,
                            double &pw,
                            int &ifault) {
    /* Algorithm AS R94, Journal of the Royal Statistical Society Series C
     * (Applied Statistics) vol. 44, no. 4, pp. 547-551 (1995).
     */

    w = 0; // stop the behaviour change 'feature'
    pw = 0;
    ifault = 0;

    /* Initialized data */
    static const double zero = 0.0;
    static const double one = 1.0;
    static const double two = 2.0;
    static const double three = 3.0;

    static const double z90 = 1.2816;
    static const double z95 = 1.6449;
    static const double z99 = 2.3263;
    static const double zm = 1.7509;
    static const double zss = 0.56268;
    static const double bf1 = 0.8378;
    static const double xx90 = 0.556;
    static const double xx95 = 0.622;
    static const double sqrth = 0.70711;/* sqrt(1/2) = .7071068 */
    static const double small_value = 1e-19;
    static const double pi6 = 1.909859;
    static const double stqr = 1.047198;

    /* polynomial coefficients */
    static const double g[2] = { -2.273,.459 };
    static const double c1[6] = { 0.,.221157,-.147981,-2.07119, 4.434685, -2.706056 },
                     c2[6] = { 0.,.042981,-.293762,-1.752461,5.682633, -3.582633 };
    static const double c3[4] = { .544,-.39978,.025054,-6.714e-4 };
    static const double c4[4] = { 1.3822,-.77857,.062767,-.0020322 };
    static const double c5[4] = { -1.5861,-.31082,-.083751,.0038915 };
    static const double c6[3] = { -.4803,-.082676,.0030302 };
    static const double c7[2] = { .164,.533 };
    static const double c8[2] = { .1736,.315 };
    static const double c9[2] = { .256,-.00635 };

    /* System generated locals */
    double r__1;

    a.resize(n, 0.0);

    /*
     * Auxiliary routines : poly()  {below}
     */

    /* Local variables */
    int i, j, ncens, i1, nn2;

    double zbar, ssassx, summ2, ssumm2, gamma, delta, range;
    double a1, a2, an, bf, ld, m, s, sa, xi, sx, xx, y, w1;
    double fac, asa, an25, ssa, z90f, sax, zfm, z95f, zsd, z99f, rsn, ssx, xsx;

    /* Parameter adjustments */
    pw = 1.0;
    ifault = 0;
    if (w >= 0.0) {
        w = 1.0;
    }
    an = (double) (n);
    nn2 = n / 2;
    if (n2 < nn2) {
        ifault = 3; 
        return;
    }
    if (n < 3) {
        ifault = 1; 
        return;
    }

    /*  If INIT is false, calculate coefficients a[] for the test */
    if (! (init)) {
        if (n==3) {
                a[0] = sqrth;
        } else {
            an25 = an + .25;
            summ2 = zero;
            for (i = 1; i <= n2; ++i) {
                a[i-1] = (double) ppnd7((i - .375f) / an25, ifault);
                if (ifault) {
                    ifault = 8;
                    return;
                }
                summ2 += (a[i-1] * a[i-1]);
            }
            summ2 *= two;
            ssumm2 = std::sqrt(summ2);
            rsn = one / std::sqrt(an);
            a1 = poly(c1, 6, rsn) - a[0] / ssumm2;

            /* Normalize a[] */
            if (n > 5) {
                i1 = 3;
                a2 = -a[1] / ssumm2 + poly(c2, 6, rsn);
                fac = std::sqrt((summ2 - two * a[0] * a[0] - two * a[1] * a[1])
                            / (one - two * a1 * a1 - two * a2 * a2));
                a[1] = a2;
            } else {
                i1 = 2;
                fac = std::sqrt((summ2 - two * a[0] * a[0]) / ( one  - two * a1 * a1));
            }
            a[0] = a1;
            for (i = i1; i <= nn2; ++i) a[i-1] /= - fac;
        }
        init = true;
    }
    if (n1 < 3) {
        ifault = 1; 
        return;
    }
    ncens = n - n1;
    if (ncens < 0 || (ncens > 0 && n < 20)) {
        ifault = 4; 
        return;
    }
    delta = (double) ncens / an;
    if (delta > 0.8) {
        ifault = 5; 
        return;
    }

    /*  If W input as negative, calculate significance level of -W */
    if (w < zero) {
        w1 = 1. + w;
        ifault = 0;
        goto L70;
    }

    /*  Check for zero range */
    range = x[n1 - 1].second - x[0].second;
    if (range < small_value) {
        ifault = 6; 
        return;
    }

    /*  Check for correct sort order on range - scaled X */

    /* *ifault = 7; <-- a no-op, since it is set 0, below, in ANY CASE! */
    ifault = 0;
    xx = x[0].second / range;
    sx = xx;
    sa = -a[0];
    j = n - 1;
    for (i = 2; i <= n1; ++i) {
        xi = x[i-1].second / range;
        if (xx - xi > small_value) {
            /* Fortran had:  print *, "ANYTHING"
            * but do NOT; it *does* happen with sorted x (on Intel GNU/linux):
            *  shapiro.test(c(-1.7, -1,-1,-.73,-.61,-.5,-.24, .45,.62,.81,1))
            */
            ifault = 7;
            return;
        }
        sx += xi;
        if (i != j) sa += sign(1,i - j) * a[std::min(i,j)-1];
        xx = xi;
        --j;
    }
    if (n > 5000) {
        ifault = 2;
    }

/*  Calculate W statistic as squared correlation
    between data and coefficients */

    sa /= n1;
    sx /= n1;
    ssa = ssx = sax = zero;
    j = n;
    for (i = 1; i <= n1; ++i, --j) {
        if (i != j)
            asa = sign(1,i - j) * a[std::min(i,j)-1] - sa;
        else
            asa = -sa;
        xsx = x[i-1].second / range - sx;
        ssa += asa * asa;
        ssx += xsx * xsx;
        sax += asa * xsx;
    }

/*  W1 equals (1-W) claculated to avoid excessive rounding error
    for W very near 1 (a potential problem in very large samples) */

    ssassx = std::sqrt(ssa * ssx);
    w1 = (ssassx - sax) * (ssassx + sax) / (ssa * ssx);
L70:
    w = 1. - w1;

/*  Calculate significance level for W */

    if (n == 3) {/* exact P value : */
        pw = pi6 * (std::asin(std::sqrt(w)) - stqr);
        return;
    }
    y = std::log(w1);
    xx = std::log(an);
    m = zero;
    s = one;
    if (n <= 11) {
        gamma = poly(g, 2, an);
        if (y >= gamma) {
            pw = small_value;/* FIXME: rather use an even small_valueer value, or NA ? */
            return;
        }
        y = -std::log(gamma - y);
        m = poly(c3, 4, an);
        s = std::exp(poly(c4, 4, an));
    } else {/* n >= 12 */
        m = poly(c5, 4, xx);
        s = std::exp(poly(c6, 3, xx));
    }
    /*DBG printf("c(w1=%g, w=%g, y=%g, m=%g, s=%g)\n",w1,*w,y,m,s); */

    if (ncens > 0) {/* <==>  n > n1 */
    /*  Censoring by proportion NCENS/N.
        Calculate mean and sd of normal equivalent deviate of W. */

        ld = -std::log(delta);
        bf = one + xx * bf1;
        r__1 = pow(xx90, (double) xx);
        z90f = z90 + bf * std::pow(poly(c7, 2, r__1), (double) ld);
        r__1 = pow(xx95, (double) xx);
        z95f = z95 + bf * std::pow(poly(c8, 2, r__1), (double) ld);
        z99f = z99 + bf * std::pow(poly(c9, 2, xx), (double)ld);

        /* Regress Z90F,...,Z99F on normal deviates Z90,...,Z99 to get
         * pseudo-mean and pseudo-sd of z as the slope and intercept 
         */

        zfm = (z90f + z95f + z99f) / three;
        zsd = (z90 * (z90f - zfm) +
                    z95 * (z95f - zfm) + z99 * (z99f - zfm)) / zss;
        zbar = zfm - zsd * zm;
        m += zbar * s;
        s *= zsd;
    }
    pw = alnorm((y-m)/s, true);
    /*  = alnorm_(dble((Y - M)/S), 1); */

    // Results are returned in w, pw and ifault
    return;
} /* swilk */


double ms_shapiro_wilk::poly(const double *cc, int nord, double x)
{
    /* Auxiliary procedure in algorithm AS 181, Journal of the Royal 
     * Statistical Society Series C (Applied Statistics) vol. 31, no. 2, 
     * pp. 176-180 (1982).
     */

    /* Local variables */

    int n2,j;
    double ret_val = cc[0];
    if (nord == 1) return ret_val;
    double p = x * cc[nord-1];
    if (nord != 2) {
        n2 = nord - 2;
        j = n2 + 1;
        for (int i=1;i<=n2;i++) {
            p = (p + cc[j-1])*x;
            j--;
        }
    }
    ret_val = ret_val + p;
    return ret_val;
} /* poly */


double ms_shapiro_wilk::ppnd7(double p, int &ifault) {
    /* Algorithm AS 241, Journal of the Royal Statistical Society Series C
     * (Applied Statistics) vol. 26, no. 3, pp. 118-121 (1977).
     */
  
    static const double zero = 0.0;
    static const double one = 1.0;
    static const double half = 0.5;
    static const double split1 = 0.425;
    static const double split2 = 5.0;
    static const double const1 = 0.180625;
    static const double const2 = 1.6;
    static const double a0 = 3.3871327179E+00;
    static const double a1 = 5.0434271938E+01;
    static const double a2 = 1.5929113202E+02;
    static const double a3 = 5.9109374720E+01;
    static const double b1 = 1.7895169469E+01;
    static const double b2 = 7.8757757664E+01;
    static const double b3 = 6.7187563600E+01;
    static const double c0 = 1.4234372777E+00;
    static const double c1 = 2.7568153900E+00;
    static const double c2 = 1.3067284816E+00;
    static const double c3 = 1.7023821103E-01;
    static const double d1 = 7.3700164250E-01;
    static const double d2 = 1.2021132975E-01;
    static const double e0 = 6.6579051150E+00;
    static const double e1 = 3.0812263860E+00;
    static const double e2 = 4.2868294337E-01;
    static const double e3 = 1.7337203997E-02;
    static const double f1 = 2.4197894225E-01;
    static const double f2 = 1.2258202635E-02;

    double normal_dev;
    double q;
    double r;

    ifault = 0;
    q = p - half;
    if (std::abs(q) <= split1) {
        r = const1 - q * q;
        normal_dev = q * (((a3 * r + a2) * r + a1) * r + a0) / 
                     (((b3 * r + b2) * r + b1) * r + one);
        return normal_dev;
    } else {
        if (q < zero) {
            r = p;
        } else {
            r = one - p;
        }
        if (r <= zero) {
            ifault = 1;
            normal_dev = zero;
            return normal_dev;
        }
        r = std::sqrt(-std::log(r));
        if (r <= split2) {
            r = r - const2;
            normal_dev = (((c3 * r + c2) * r + c1) * r + c0) / ((d2 * r + d1) * r + one);
        } else {
            r = r - split2;
            normal_dev = (((e3 * r + e2) * r + e1) * r + e0) / ((f2 * r + f1) * r + one);
        }
        if (q < zero) { normal_dev = - normal_dev; }
        return normal_dev;
    }
}


double ms_shapiro_wilk::alnorm(double x,bool upper) {
    /* Algorithm AS 66, Journal of the Royal Statistical Society Series C
     * (Applied Statistics) vol. 22, pp. 424-427 (1973).
     */

    static const double zero = 0;
    static const double one = 1;
    static const double half = 0.5;
    static const double con = 1.28;
    static const double ltone = 7.0;
    static const double utzero = 18.66;
    static const double p = 0.398942280444;
    static const double q = 0.39990348504;
    static const double r = 0.398942280385;   
    static const double a1 = 5.75885480458;
    static const double a2 = 2.62433121679;
    static const double a3 = 5.92885724438;  
    static const double b1 = -29.8213557807;
    static const double b2 = 48.6959930692;
    static const double c1 = -3.8052E-8;
    static const double c2 = 3.98064794E-4;
    static const double c3 = -0.151679116635;
    static const double c4 = 4.8385912808;
    static const double c5 = 0.742380924027;
    static const double c6 = 3.99019417011;  
    static const double d1 = 1.00000615302;
    static const double d2 = 1.98615381364;
    static const double d3 = 5.29330324926;  
    static const double d4 = -15.1508972451;
    static const double d5 = 30.789933034;
  
    double alnorm;
    double z;
    double y;
    bool up = upper;
    z = x;
    if (z < zero){
        up = !up;
        z = -z;
    }
    if (z <= ltone || (up && z <= utzero)) {
        y = half * z * z;
        if (z > con) {
            alnorm = r * std::exp(-y) / (z + c1 + d1 / (z + c2 + d2 / (z + c3 + d3 
                    / (z + c4 + d4 / (z + c5 + d5 / (z + c6))))));
        } else {
            alnorm = half - z * (p - q * y/(y + a1 + b1 /(y + a2 + b2/(y + a3))));
        }
    } else {
        alnorm = zero;
    }

    if(!up) { alnorm = one - alnorm; }
    return alnorm;
}

long ms_shapiro_wilk::sign(long x,long y) {
    if (y<0) 
        return -std::labs(x);
    else
        return std::labs(x);
}

Copyright © 2016 Matrix Science Ltd.  All Rights Reserved. Generated on Fri Jun 2 2017 01:44:51