
tools_stats.pl

Using statistical routines.

#!/usr/local/bin/perl
##############################################################################
# file: tools_stats.pl                                                       #
# Mascot Parser toolkit example code                                         #
##############################################################################
# COPYRIGHT NOTICE                                                           #
# Copyright 1998-2012 Matrix Science Limited  All Rights Reserved.           #
#                                                                            #
##############################################################################
#     $Source: parser/examples/test_perl/tools_stats.pl $
#     $Author: villek@matrixscience.com $
#       $Date: 2018-07-30 16:23:53 +0100 $
#   $Revision: 1b450440f9c97e1e41d0fc6016a27d68951d4532 | MSPARSER_REL_2_8_1-0-gea32989045 $ 
##############################################################################
use strict;
use warnings;
##############################################################################

use msparser;

print "Binomial coefficients for sample size 6:\n";

for my $i (0 .. 6) {
    printf "\t6 choose %d = %g\n", $i, msparser::ms_quant_stats::binomialCoefficient(6, $i);
}

print "First 10 values of the Poisson(lambda = 1) distribution:\n";

for my $i (1 .. 10) {
    printf "\tPoisson(lambda 1, k %d) = %g\n", $i, msparser::ms_quant_stats::poissonDensity(1, $i);
}

print "Common values of the standard normal distribution:\n";

for (0.001, 0.01, 0.025, 0.05, 0.1, 0.5, 0.9, 0.95, 0.975, 0.99, 0.999) {
    # Note that normalCriticalValue() takes an argument between 0.5 and 1.0;
    # that is, it only returns the non-negative (right-half) critical values,
    # so left-tail critical values are obtained below by symmetry.
    my $critval;
    
    if ($_ < 0.5) {
        $critval = -msparser::ms_quant_stats::normalCriticalValue(1 - $_);
    } else {
        $critval = msparser::ms_quant_stats::normalCriticalValue($_);
    }

    printf "\tNormal-1(%g) = %g\n", $_, $critval;
}

for (-3.09, -2.32, -1.96, -1.64, -1.28, 0, 1.28, 1.64, 1.96, 2.32, 3.09) {
    # Note that normalCumulativeProbability() takes an argument between 0 and
    # 4.09; it returns the probability mass between the mean and x (the right
    # half of the distribution), so the left half is obtained by symmetry.
    # (A quick sanity check combining the two functions follows after this loop.)
    my $p;

    if ($_ < 0) {
        $p = 0.5 - msparser::ms_quant_stats::normalCumulativeProbability(-$_);
    } else {
        $p = 0.5 + msparser::ms_quant_stats::normalCumulativeProbability($_);
    }

    printf "\tNormal(x <= %g) = %g\n", $_, $p;
}
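
# Combining the two conventions above: for 0 <= x <= 4.09 the two calls
# roughly invert each other. A quick sanity check (illustrative only, not
# part of the original example):
#
#   # should print a value close to 1.96
#   printf "%g\n", msparser::ms_quant_stats::normalCriticalValue(
#       0.5 + msparser::ms_quant_stats::normalCumulativeProbability(1.96));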

print "Common values of the chi-square distribution at 10 degrees of freedom:\n";

# Note that the chi-square critical value functions support only certain
# significance levels; the values used below are among them.
for (0.001, 0.01, 0.025, 0.05, 0.1) {
    my $lowcrit = msparser::ms_quant_stats::chisqLowerCriticalValue(10, $_);
    my $upcrit = msparser::ms_quant_stats::chisqUpperCriticalValue(10, $_);

    printf "\tchi-sq(df 10, sigLevel %g) = <%g, %g>\n", $_, $lowcrit, $upcrit;
}

print "Common values of the Student's t distribution at 10 degrees of freedom:\n";

# Likewise, the Student's t function supports only certain significance levels.
for (0.001, 0.005, 0.01, 0.025, 0.05, 0.1) {
    printf "\tt(df 10, sigLevel %g) = %g\n", 
        $_,
        msparser::ms_quant_stats::studentsCriticalValue(10, $_);
}

print "Common values of the F distribution at 10 and 1 degrees of freedom:\n";

# Likewise, the F function supports only certain significance levels.
for (0.01, 0.05, 0.1) {
    printf "\tF(df1 10, df2 1, sigLevel %g) = %g\n", 
        $_,
        msparser::ms_quant_stats::snedecorsCriticalValue(10, 1, $_);
}

my @samples = (
    # Uniform data generated with 
    #   perl -e 'print join "\n", map {+ rand() } 0 .. 9'
    { name => 'uniform(0,1)', popmean => 0.5, data => [split /\s+/, <<EOD] },
0.400313346021907
0.635154745166318
0.795328049493161
0.465079196585926
0.709667388012424
0.76161797692474
0.617230566355229
0.282583560369936
0.251706165020014
0.2962014901575
EOD
    # Normal(5, 2.5) data generated in R with
    # > rnorm(10, 5, 2.5)
    { name => 'normal(5,2.5)', popmean => 5, data => [split /\s+/, <<EOD] },
3.1476833 
9.2061900 
2.6079399 
4.4361054 
6.4133568 
4.3745899 
3.5588195
3.0542840 
0.6643241 
4.3697818
EOD
    # Beta(0.5, 0.5) data generated in R with
    # > rbeta(10, 0.5, 0.5)
    { name => 'beta(0.5,0.5)', popmean => 0.5, data => [split /\s+/, <<EOD] },
0.754152463 
0.008992998 
0.221637853 
0.426146895 
0.656421042 
0.224818243
0.022050868 
0.539143737 
0.877643550 
0.128030530
EOD
);

print "\n";

for my $sample (@samples) {
    printf "Sample '%s' = [%s]\n", $sample->{'name'}, join(', ', @{ $$sample{'data'} });

    # ms_quant_stats takes as input STL vectors of type double. We can easily
    # convert a Perl array into one:
    my $vec = msparser::vectord->new(scalar(@{ $$sample{'data'} }));
    
    for (0 .. $#{ $$sample{'data'} }) {
        $vec->set($_, $$sample{'data'}[$_]);
    }
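
    # A reusable helper for this conversion could look like the following
    # sketch (illustrative only; to_vectord() is not part of the Mascot
    # Parser API):
    #
    #   sub to_vectord {
    #       my $vec = msparser::vectord->new(scalar @_);
    #       $vec->set($_, $_[$_]) for 0 .. $#_;
    #       return $vec;
    #   }
    #
    # after which this block becomes: my $vec = to_vectord(@{ $$sample{'data'} });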

    # The sorted sample vector is needed for the Shapiro-Wilk W test.
    # It also makes finding the sample median fast.
    my $vec_sorted;
    
    do {
        my @sorted = sort { $a <=> $b } @{ $$sample{'data'} };
        $vec_sorted = msparser::vectord->new(scalar(@sorted));
        
        for (0 .. $#{ $$sample{'data'} }) {
            $vec_sorted->set($_, $sorted[$_]);
        }
    };

    # Auxiliary helper functions:

    printf "\tSum = %g\n", msparser::ms_quant_stats::sum($vec);
    printf "\tSum of squares = %g\n", msparser::ms_quant_stats::sumsq($vec);
    printf "\tLog transformed = [%s]\n", join(', ', @{ msparser::ms_quant_stats::logTransform($vec) });
    printf "\tExp transformed = [%s]\n", join(', ', @{ msparser::ms_quant_stats::expTransform($vec) });

    # Basic sample statistics:

    my $xbar = msparser::ms_quant_stats::arithmeticMean($vec);
    my $stdev = msparser::ms_quant_stats::arithmeticStandardDeviation($vec, $xbar);
    my $median = msparser::ms_quant_stats::sortedMedian($vec_sorted);
    # If the vector isn't sorted:
    # my $median = msparser::ms_quant_stats::unsortedMedian($vec);

    my $geombar = msparser::ms_quant_stats::geometricMean($vec);
    my $geomdev = msparser::ms_quant_stats::geometricStandardDeviation($vec, $geombar);

    printf "\tArithmetic stdev = %g\n", $stdev;
    printf "\tArithmetic mean = %g\n", $xbar;
    printf "\tStandard error of mean = %g\n", msparser::ms_quant_stats::arithmeticStandardErrorOfMean($stdev, $vec->size());
    printf "\tMedian = %g\n", $median;
    printf "\tStandard error of median = %g\n", msparser::ms_quant_stats::arithmeticStandardErrorOfMedian($stdev, $vec->size());
    printf "\tGeometric mean = %g\n", $geombar;
    printf "\tGeometric stdev = %g\n", $geomdev;

    # Basic distribution tests for normally distributed data:

    printf "\tHypothesis: distribution is N(%g, sigma^2) (mean test). p-value = %g\n", 
        $$sample{'popmean'}, 
        msparser::ms_quant_stats::calculateNormalMeanPvalue($$sample{'popmean'}, $xbar, $stdev, $vec->size());

    printf "\tHypothesis: distribution is N(%g, sigma^2) (median test). p-value = %g\n", 
        $$sample{'popmean'}, 
        msparser::ms_quant_stats::calculateNormalMedianPvalue($$sample{'popmean'}, $median, $stdev, $vec->size());

    # (We would expect the normal sample to have the highest p-value, i.e. the
    # least evidence against the hypothesis, while the uniform and beta samples
    # should have fairly low p-values. However, the sample size is so small
    # that the tests are not very reliable.)

    do {
        my ($ok, $W, $p) = msparser::ms_quant_stats::testShapiroWilkW($vec_sorted);
        printf "\tShapiro-Wilk ok = %d, W = %s, p-value = %s\n", $ok, (defined $W ? $W : '(none)'), (defined $p ? $p : '(none)');
    };

    # Outlier detection can be done even when the above tests indicate that
    # the sample is not normally distributed. However, don't trust the results!

    do {
        # Note that the input vector must be sorted for outlier testing to
        # work.
        my ($numL, $numH) = msparser::ms_quant_stats::detectOutliers($vec_sorted, "auto", 0.05);

        # $numL and $numH are the numbers of elements at the low and high ends
        # of $vec_sorted that appear to be outliers: that is, the elements at
        # indices 0 to $numL-1 and at indices $vec_sorted->size()-$numH to
        # $vec_sorted->size()-1.
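        #
        # When $numL and $numH are defined, the flagged values themselves can
        # be pulled back out of the sorted vector along these lines (a sketch
        # only; it assumes the SWIG-wrapped vectord exposes a get() accessor
        # alongside the set() used earlier):
        #
        #   my @low  = map { $vec_sorted->get($_) } 0 .. $numL - 1;
        #   my @high = map { $vec_sorted->get($_) }
        #              $vec_sorted->size() - $numH .. $vec_sorted->size() - 1;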

        printf "\tOutlier detection('auto', 0.05') = <%s, %s>\n", (defined $numL ? $numL : '(none)'), (defined $numH ? $numH : '(none)');
    };

    print "\n";
}

=pod

Running the program as

    perl -I../bin tools_stats.pl

will give the following output under Mascot 2.5:

Binomial coefficients for sample size 6:
    6 choose 0 = 1
    6 choose 1 = 6
    6 choose 2 = 15
    6 choose 3 = 20
    6 choose 4 = 15
    6 choose 5 = 6
    6 choose 6 = 1
First 10 values of the Poisson(lambda = 1) distribution:
    Poisson(lambda 1, k 1) = 0.367879
    Poisson(lambda 1, k 2) = 0.18394
    Poisson(lambda 1, k 3) = 0.0613132
    Poisson(lambda 1, k 4) = 0.0153283
    Poisson(lambda 1, k 5) = 0.00306566
    Poisson(lambda 1, k 6) = 0.000510944
    Poisson(lambda 1, k 7) = 7.2992e-05
    Poisson(lambda 1, k 8) = 9.12399e-06
    Poisson(lambda 1, k 9) = 1.01378e-06
    Poisson(lambda 1, k 10) = 1.01378e-07
Common values of the standard normal distribution:
    Normal-1(0.001) = -3.09
    Normal-1(0.01) = -2.318
    Normal-1(0.025) = -1.959
    Normal-1(0.05) = -1.642
    Normal-1(0.1) = -1.282
    Normal-1(0.5) = 0
    Normal-1(0.9) = 1.282
    Normal-1(0.95) = 1.642
    Normal-1(0.975) = 1.959
    Normal-1(0.99) = 2.318
    Normal-1(0.999) = 3.09
    Normal(x <= -3.09) = 0.001
    Normal(x <= -2.32) = 0.00554
    Normal(x <= -1.96) = 0.025
    Normal(x <= -1.64) = 0.0505
    Normal(x <= -1.28) = 0.10027
    Normal(x <= 0) = 0.5
    Normal(x <= 1.28) = 0.89973
    Normal(x <= 1.64) = 0.9495
    Normal(x <= 1.96) = 0.975
    Normal(x <= 2.32) = 0.99446
    Normal(x <= 3.09) = 0.999
Common values of the chi-square distribution at 10 degrees of freedom:
    chi-sq(df 10, sigLevel 0.001) = <1.479, 29.588>
    chi-sq(df 10, sigLevel 0.01) = <2.558, 23.209>
    chi-sq(df 10, sigLevel 0.025) = <3.247, 20.483>
    chi-sq(df 10, sigLevel 0.05) = <3.94, 18.307>
    chi-sq(df 10, sigLevel 0.1) = <4.865, 15.987>
Common values of the Student's t distribution at 10 degrees of freedom:
    t(df 10, sigLevel 0.001) = 4.143
    t(df 10, sigLevel 0.005) = 3.169
    t(df 10, sigLevel 0.01) = 2.764
    t(df 10, sigLevel 0.025) = 2.228
    t(df 10, sigLevel 0.05) = 1.812
    t(df 10, sigLevel 0.1) = 1.372
Common values of the F distribution at 10 and 1 degrees of freedom:
    F(df1 10, df2 1, sigLevel 0.01) = 6055.85
    F(df1 10, df2 1, sigLevel 0.05) = 241.882
    F(df1 10, df2 1, sigLevel 0.1) = 60.195

Sample 'uniform(0,1)' = [0.400313346021907, 0.635154745166318, 0.795328049493161, 0.465079196585926, 0.709667388012424, 0.76161797692474, 0.617230566355229, 0.282583560369936, 0.251706165020014, 0.2962014901575]
    Sum = 5.21488
    Sum of squares = 3.10813
    Log transformed = [-0.915507673489646, -0.4538866166025, -0.229000608568736, -0.765547572658224, -0.342958886300134, -0.272310191628172, -0.482512635488649, -1.26378098321243, -1.37949288361674, -1.21671534624448]
    Exp transformed = [1.49229222822126, 1.88731417160841, 2.2151675622189, 1.59214027596837, 2.03331484127, 2.14173870243289, 1.85378698617621, 1.32655261769313, 1.28621804590024, 1.34474108162913]
    Arithmetic stdev = 0.2078
    Arithmetic mean = 0.521488
    Standard error of mean = 0.065712
    Median = 0.541155
    Standard error of median = 0.0773299
    Geometric mean = 0.480864
    Geometric stdev = 1.54969
    Hypothesis: distribution is N(0.5, sigma^2) (mean test). p-value = 0.75114
    Hypothesis: distribution is N(0.5, sigma^2) (median test). p-value = 0.607474
    Shapiro-Wilk ok = 1, W = 0.904955259787869, p-value = 0.248103833545881
    Outlier detection('auto', 0.05) = <0, 0>

Sample 'normal(5,2.5)' = [3.1476833, 9.2061900, 2.6079399, 4.4361054, 6.4133568, 4.3745899, 3.5588195, 3.0542840, 0.6643241, 4.3697818]
    Sum = 41.8331
    Sum of squares = 222.941
    Log transformed = [1.14666672193647, 2.21987608389596, 0.958560599320351, 1.48977682935013, 1.85838281560642, 1.47581277827312, 1.26942888874853, 1.11654519526921, -0.408985146179298, 1.47471307651077]
    Exp transformed = [23.2820644952793, 9958.58228914508, 13.5710642846326, 84.4454192814535, 609.93768731081, 79.4072679533238, 35.1217114826069, 21.2059965963608, 1.94317668425472, 79.0263862606734]
    Arithmetic stdev = 2.30796
    Arithmetic mean = 4.18331
    Standard error of mean = 0.72984
    Median = 3.9643
    Standard error of median = 0.858876
    Geometric mean = 3.5257
    Geometric stdev = 2.00172
    Hypothesis: distribution is N(5, sigma^2) (mean test). p-value = 0.292114
    Hypothesis: distribution is N(5, sigma^2) (median test). p-value = 0.258609
    Shapiro-Wilk ok = 1, W = 0.917418368639979, p-value = 0.335925333025062
    Outlier detection('auto', 0.05) = <0, 0>

Sample 'beta(0.5,0.5)' = [0.754152463, 0.008992998, 0.221637853, 0.426146895, 0.656421042, 0.224818243, 0.022050868, 0.539143737, 0.877643550, 0.128030530]
    Sum = 3.85904
    Sum of squares = 2.3588
    Log transformed = [-0.28216072584468, -4.71130900444498, -1.50671052190953, -0.852971168207217, -0.420952863607245, -1.49246301212003, -3.8144033127847, -0.617773070154802, -0.130514747277869, -2.05548652787785]
    Exp transformed = [2.12580905780365, 1.00903355649617, 1.24811929335097, 1.53134570563969, 1.92788017044885, 1.2520951184563, 1.02229578528984, 1.71453813789789, 2.40522523023951, 1.13658770218992]
    Arithmetic stdev = 0.310837
    Arithmetic mean = 0.385904
    Standard error of mean = 0.0982953
    Median = 0.325483
    Standard error of median = 0.115674
    Geometric mean = 0.204237
    Geometric stdev = 4.71093
    Hypothesis: distribution is N(0.5, sigma^2) (mean test). p-value = 0.275606
    Hypothesis: distribution is N(0.5, sigma^2) (median test). p-value = 0.165651
    Shapiro-Wilk ok = 1, W = 0.930482521074392, p-value = 0.452655082151213
    Outlier detection('auto', 0.05) = <0, 0>

=cut
