Matrix Science header

tools_stats.cs

Using statistical routines.

/*
##############################################################################
# file: tools_stats.cs                                                       #
# 'msparser' toolkit example code                                            #
##############################################################################
# COPYRIGHT NOTICE                                                           #
# Copyright 1998-2015 Matrix Science Limited  All Rights Reserved.           #
#                                                                            #
##############################################################################
#    $Source: /vol/cvsroot/parser/examples/test_csharp/tools_stats.cs,v $                                #
#     $Author: patricke $                                                    #
#       $Date: 2015/07/31 14:38:19 $                                         #
#   $Revision: 1.2 $                                                         #
# $NoKeywords::                                                            $ #
##############################################################################
*/

using System;
using System.Collections.Generic;
using matrix_science.msparser;

namespace MsParserExamples
{
    class tools_stats
    {

        public static void Main(string[] args)
        {
            Console.WriteLine("Binomial coefficients for sample size 6:");

            for (int i = 0; i <= 6; i++)
            {
                Console.WriteLine("\t6 choose {0} = {1}", i, ms_quant_stats.binomialCoefficient(6, (uint) i));
            }

            Console.WriteLine("First 10 values of the Poisson (lambda = 1) distribution");
            for (int i = 1; i <= 10; i++)
            {
                Console.WriteLine("\tPoisson(lambda 1, k {0}) = {1}", i, ms_quant_stats.poissonDensity(1, (uint) i));
            }

            Console.WriteLine("Common values of the standard normal distribution");
            foreach (double t in (new double[] {0.001, 0.01, 0.025, 0.05, 0.1, 0.5, 0.9, 0.95, 0.975, 0.99, 0.999})) {
                // Note that normalCriticalValue() takes and argument between 0.5 and 1.0;
                // that is, it returns only the right tail critical values
                double critval;
                if (t < 0.5)
                {
                    critval = -ms_quant_stats.normalCriticalValue(1d - t);
                }
                else
                {
                    critval = ms_quant_stats.normalCriticalValue(t);
                }
                Console.WriteLine("\tNormal-1({0}) = {1}", t, critval);
            }

            foreach (double t in (new double[] {-3.09, -2.32, -1.96, -1.64, -1.28, 0, 1.28, 1.64, 1.96, 2.32, 3.09}))
            {
                // Note that normalCumulativeProbability() takes and argument between 0 and 4.09; that is
                // it reports only the right tail probability
                double p;
                if (t < 0)
                {
                    p = 0.5 - ms_quant_stats.normalCumulativeProbability(-t);
                }
                else
                {
                    p = 0.5 + ms_quant_stats.normalCumulativeProbability(t);
                }
                Console.WriteLine("\tNormal(x <= {0}) = {1:0.######}", t, p);
            }

            Console.WriteLine("Common values of the chi-squared distribution at 10 degrees of freedom:");
            
            // Note the significance levels the chi-square functions support
            foreach (double t in (new double[] {0.001, 0.01, 0.025, 0.05, 0.1})) {
                double lowcrit = ms_quant_stats.chisqLowerCriticalValue(10, t);
                double upcrit = ms_quant_stats.chisqUpperCriticalValue(10, t);
                Console.WriteLine("\tchi-sq(df 10, sigLevel {0}) = <{1}, {2}>", t, lowcrit, upcrit);
            }

            Console.WriteLine("Common values of the Student's t distribution at 10 degrees of freedom:");

            // Note the significance levels the Student's t function supports.
            foreach (double t in (new double[] {0.001, 0.005, 0.01, 0.025, 0.05, 0.1})) 
            {
                Console.WriteLine("\tt(df 10, sigLevel {0}) = {1}", t, ms_quant_stats.studentsCriticalValue(10, t));
            }

            Console.WriteLine("Common values of the F distribution at 10 and 1 degrees of freedom:");

            // Note the significance levels the F function supports
            foreach(double t in (new double[] {0.01, 0.05, 0.1})) 
            {
                Console.WriteLine("\tF(df1 10, df2 1, sifLevel {0}) = {1}", t, ms_quant_stats.snedecorsCriticalValue(10, 1, t));
            }

            // see Sample struct below
            List<Sample> samples = new List<Sample>();

            // uniform data generated with
            // perl -e 'print join "\n", map {+ rand() } 0 .. 0'
            samples.Add(new Sample("uniform(0,1)",
                        new double[] { 0.400313346021907, 0.635154745166318, 0.795328049493161, 0.465079196585926, 0.709667388012424 ,
                        0.76161797692474, 0.617230566355229, 0.282583560369936, 0.251706165020014, 0.2962014901575},
                        0.5));

            // Normal (5, 2.5) data generated in R with
            // > rnorm(10, 5, 2.5)
            samples.Add(new Sample("normal(5, 2.5)",
                new double[] { 3.1476833, 9.2061900, 2.6079399, 4.4361054, 6.4133568, 4.3745899, 3.5588195, 3.0542840, 0.6643241,4.3697818},
                5));

            // Beta (0.5, 0.5) data generated in R with
            // > rbeta(10, 0.5, 0.5)
            samples.Add(new Sample("beta(0.5, 0.5)",
                new double[] { 0.754152463, 0.008992998, 0.221637853, 0.426146895, 0.656421042, 0.224818243, 0.022050868, 0.539143737 ,
                0.877643550 ,0.128030530 },
                0.5));

            Console.WriteLine();
            foreach (Sample sample in samples)
            {
                vectord vec = sample.Data;
                // Note: vectord does not provided a ToArray method as standard.  The ToArray method used
                // here is an extension implemented in the VectordExtensionMethods class (see below)
                Console.WriteLine("Sample '{0}' = [{1}]", sample.SampleName, string.Join(", ", vec.ToArray()));

                // The sorted sample vector is needed for the Shapiro-Wilk W test
                // it also makes finsing the sample median fast
                vectord vecSorted = sample.SortedData;

                // Auxiliary helper functions:
                Console.WriteLine("\tSum = {0:#.#####}", ms_quant_stats.sum(vec));
                Console.WriteLine("\tSum of squares = {0:#.#####}", ms_quant_stats.sumsq(vec));

                // Note: vectord does not provided a ToArray method as standard.  The ToArray method used
                // here is an extension implemented in the VectordExtensionMethods class (see below)
                Console.WriteLine("\tLog transformed = [{0}]", string.Join(", ", ms_quant_stats.logTransform(vec).ToArray()));
                Console.WriteLine("\tExp transformed = [{0}]", string.Join(", ", ms_quant_stats.expTransform(vec).ToArray()));

                // Basic sample statistics
                double xbar = ms_quant_stats.arithmeticMean(vec);
                double stdev = ms_quant_stats.arithmeticStandardDeviation(vec, xbar);
                double median = ms_quant_stats.sortedMedian(vecSorted);
                // If the vector isn't sorted:
                // double median = ms_quant_stats.unsortedMedian(vec);

                double geombar = ms_quant_stats.geometricMean(vec);
                double geomdev = ms_quant_stats.geometricStandardDeviation(vec, xbar);

                Console.WriteLine("\tArithmetic stdev = {0}", stdev);
                Console.WriteLine("\tArithmetic mean = {0}", xbar);
                Console.WriteLine("\tStandard error of mean = {0}", ms_quant_stats.arithmeticStandardErrorOfMean(stdev, (uint)vec.Count));
                Console.WriteLine("\tMedian = {0}", median);
                Console.WriteLine("\tStandard error of median = {0}", ms_quant_stats.arithmeticStandardErrorOfMedian(stdev, (uint)vec.Count));
                Console.WriteLine("\tGeometric mean = {0}", geombar);
                Console.WriteLine("\tGeometric stdev = {0}", geomdev);

                // Basic distribution tests for normally distributed data:
                Console.WriteLine("\tHypothesis: distribution is N({0}, sigma^2) (mean test). p-value = {1:0.#####}", sample.PopulationMean,
                    ms_quant_stats.calculateNormalMeanPvalue(sample.PopulationMean, xbar, stdev, (uint)vec.Count));

                Console.WriteLine("\tHypothesis: distribution is N({0}, sigma^2) (median test). p-value = {1:0.#####}", sample.PopulationMean,
                    ms_quant_stats.calculateNormalMedianPvalue(sample.PopulationMean, median, stdev, (uint)vec.Count));

                // (We would expect the normal sample to have the highest p-value, i.e.
                // least evidence against the hypothesis, while the uniform and beta
                // distributions should have fairly low p-values.  However, sample size is
                // so small that the tests are not very reliable)

                {
                    double W, p;
                    bool ok = ms_quant_stats.testShapiroWilkW(vecSorted, out W, out p);
                    Console.WriteLine("\tShapiro-Wilk ok = {0}, W = {1}, p-value = {2}", ok, W != 0 ? W.ToString() : "(none)",
                        p != 0 ? p.ToString() : "(none)");
                }

                // outlier detection can be done even when the above tests indicate that the
                // sample is not normally distributed - after outlier removal, the sample may
                // be normally distributed!
                {
                    uint numL, numH;
                    ms_quant_stats.detectOutliers(vecSorted, "auto", 0.05, out numL, out numH);

                    // numL and numH are the number of elements at each end of vecSorted
                    // which seem to be outliers.  That is, elements between 0 and numL-1, and
                    // vecSorted.Count-numH to vecSorted.Count-1.

                    Console.WriteLine("\tOutlier detection('auto',0.05) = <{0}, {1}>", numL,
                        numH);
                }

                Console.WriteLine();
            }

        }

        

    }

    
    public static class VectordExtensionMethods
    {
        // vectord doesn't provide a ToArray method, instead providing a CopyTo method for
        // populating an existing double array.  This extension class adds a
        // ToArray helper method to vectord
        public static double[] ToArray(this vectord vector)
        {
            double[] data = new double[vector.Count];
            vector.CopyTo(data);
            return data;
        }
    }

    struct Sample
    {
        private string Name;
        private double[] data;
        private double PopMean;

        public Sample(string name, double[] data, double popmean)
        {
            this.Name = name;
            this.data = data;
            this.PopMean = popmean;
        }

        public string SampleName
        {
            get
            {
                return Name;
            }
        }

        public double PopulationMean
        {
            get
            {
                return PopMean;
            }
        }

        public vectord Data
        {
            get
            {
                return new vectord(data);
            }
        }

        public vectord SortedData
        {
            get
            {
                // maintain the underlying data array unsorted
                double[] dataCopy = new double[data.Length];
                this.Data.CopyTo(dataCopy);
                Array.Sort(dataCopy);
                return new vectord(dataCopy);
            }
        }
    }
}

/*
tools_stats.exe
Will give the following output

Binomial coefficients for sample size 6:
    6 choose 0 = 1
    6 choose 1 = 6
    6 choose 2 = 15
    6 choose 3 = 20
    6 choose 4 = 15
    6 choose 5 = 6
    6 choose 6 = 1
First 10 values of the Poisson (lambda = 1) distribution
    Poisson(lambda 1, k 1) = 0.367879441171442
    Poisson(lambda 1, k 2) = 0.183939720585721
    Poisson(lambda 1, k 3) = 0.0613132401952405
    Poisson(lambda 1, k 4) = 0.0153283100488101
    Poisson(lambda 1, k 5) = 0.00306566200976203
    Poisson(lambda 1, k 6) = 0.00051094366829367
    Poisson(lambda 1, k 7) = 7.29919526133814E-05
    Poisson(lambda 1, k 8) = 9.12399407667265E-06
    Poisson(lambda 1, k 9) = 1.0137771196303E-06
    Poisson(lambda 1, k 10) = 1.01377711963029E-07
Common values of the standard normal distribution
    Normal-1(0.001) = -3.09
    Normal-1(0.01) = -2.318
    Normal-1(0.025) = -1.959
    Normal-1(0.05) = -1.642
    Normal-1(0.1) = -1.282
    Normal-1(0.5) = 0
    Normal-1(0.9) = 1.282
    Normal-1(0.95) = 1.642
    Normal-1(0.975) = 1.959
    Normal-1(0.99) = 2.318
    Normal-1(0.999) = 3.09
    Normal(x <= -3.09) = 0.001
    Normal(x <= -2.32) = 0.00554
    Normal(x <= -1.96) = 0.025
    Normal(x <= -1.64) = 0.0505
    Normal(x <= -1.28) = 0.10027
    Normal(x <= 0) = 0.5
    Normal(x <= 1.28) = 0.89973
    Normal(x <= 1.64) = 0.9495
    Normal(x <= 1.96) = 0.975
    Normal(x <= 2.32) = 0.99446
    Normal(x <= 3.09) = 0.999
Common values of the chi-squared distribution at 10 degrees of freedom:
    chi-sq(df 10, sigLevel 0.001) = <1.479, 29.588>
    chi-sq(df 10, sigLevel 0.01) = <2.558, 23.209>
    chi-sq(df 10, sigLevel 0.025) = <3.247, 20.483>
    chi-sq(df 10, sigLevel 0.05) = <3.94, 18.307>
    chi-sq(df 10, sigLevel 0.1) = <4.865, 15.987>
Common values of the Student's t distribution at 10 degrees of freedom:
    t(df 10, sigLevel 0.001) = 4.143
    t(df 10, sigLevel 0.005) = 3.169
    t(df 10, sigLevel 0.01) = 2.764
    t(df 10, sigLevel 0.025) = 2.228
    t(df 10, sigLevel 0.05) = 1.812
    t(df 10, sigLevel 0.1) = 1.372
Common values of the F distribution at 10 and 1 degrees of freedom:
    F(df1 10, df2 1, sifLevel 0.01) = 6055.85
    F(df1 10, df2 1, sifLevel 0.05) = 241.882
    F(df1 10, df2 1, sifLevel 0.1) = 60.195

Sample 'uniform(0,1)' = [0.400313346021907, 0.635154745166318, 0.795328049493161, 0.465079196585926, 0.709667388012424, 0.76161797692474, 0.617230566355229, 0.282583560369936, 0.251706165020014, 0.2962014901575]
    Sum = 5.21488
    Sum of squares = 3.10813
    Log transformed = [-0.915507673489646, -0.4538866166025, -0.229000608568736, -0.765547572658224, -0.342958886300134, -0.272310191628172, -0.482512635488649, -1.26378098321243, -1.37949288361674, -1.21671534624448]
    Exp transformed = [1.49229222822126, 1.88731417160841, 2.2151675622189, 1.59214027596837, 2.03331484127, 2.14173870243289, 1.85378698617621, 1.32655261769313, 1.28621804590024, 1.34474108162913]
    Arithmetic stdev = 0.207799545425621
    Arithmetic mean = 0.521488248410716
    Standard error of mean = 0.0657119860292586
    Median = 0.541154881470578
    Standard error of median = 0.0773298651592315
    Geometric mean = 0.480863737139541
    Geometric stdev = 1.56255472576704
    Hypothesis: distribution is N(0.5, sigma^2) (mean test). p-value = 0.75114
    Hypothesis: distribution is N(0.5, sigma^2) (median test). p-value = 0.60747
    Shapiro-Wilk ok = True, W = 0.904955259787869, p-value = 0.248103833545881
    Outlier detection('auto',0.05) = <0, 0>

Sample 'normal(5, 2.5)' = [3.1476833, 9.20619, 2.6079399, 4.4361054, 6.4133568, 4.3745899, 3.5588195, 3.054284, 0.6643241, 4.3697818]
    Sum = 41.83307
    Sum of squares = 222.94057
    Log transformed = [1.14666672193647, 2.21987608389596, 0.958560599320351, 1.48977682935013, 1.85838281560642, 1.47581277827312, 1.26942888874853, 1.11654519526921, -0.408985146179298, 1.47471307651077]
    Exp transformed = [23.2820644952793, 9958.58228914508, 13.5710642846326, 84.4454192814535, 609.93768731081, 79.4072679533238, 35.1217114826069, 21.2059965963608, 1.94317668425472, 79.0263862606734]
    Arithmetic stdev = 2.30795630958355
    Arithmetic mean = 4.18330747
    Standard error of mean = 0.729839867844071
    Median = 3.96430065
    Standard error of median = 0.858875556478903
    Geometric mean = 3.52569572037875
    Geometric stdev = 2.04835540793366
    Hypothesis: distribution is N(5, sigma^2) (mean test). p-value = 0.29211
    Hypothesis: distribution is N(5, sigma^2) (median test). p-value = 0.25861
    Shapiro-Wilk ok = True, W = 0.917418368639979, p-value = 0.335925333025062
    Outlier detection('auto',0.05) = <0, 0>

Sample 'beta(0.5, 0.5)' = [0.754152463, 0.008992998, 0.221637853, 0.426146895, 0.656421042, 0.224818243, 0.022050868, 0.539143737, 0.87764355, 0.12803053]
    Sum = 3.85904
    Sum of squares = 2.3588
    Log transformed = [-0.28216072584468, -4.71130900444498, -1.50671052190953, -0.852971168207217, -0.420952863607245, -1.49246301212003, -3.8144033127847, -0.617773070154802, -0.130514747277869, -2.05548652787785]
    Exp transformed = [2.12580905780365, 1.00903355649617, 1.24811929335097, 1.53134570563969, 1.92788017044885, 1.2520951184563, 1.02229578528984, 1.71453813789789, 2.40522523023951, 1.13658770218992]
    Arithmetic stdev = 0.310837187272829
    Arithmetic mean = 0.3859038179
    Standard error of mean = 0.0982953493262443
    Median = 0.325482569
    Standard error of median = 0.115673967087124
    Geometric mean = 0.204236938593202
    Geometric stdev = 5.41293770078999
    Hypothesis: distribution is N(0.5, sigma^2) (mean test). p-value = 0.27561
    Hypothesis: distribution is N(0.5, sigma^2) (median test). p-value = 0.16565
    Shapiro-Wilk ok = True, W = 0.930482521074392, p-value = 0.452655082151213
    Outlier detection('auto',0.05) = <0, 0>


*/
Copyright © 2016 Matrix Science Ltd.  All Rights Reserved. Generated on Fri Jun 2 2017 01:44:50