
tools_stats.pl

Using statistical routines.

#!/usr/local/bin/perl
##############################################################################
# file: tools_stats.pl                                                       #
# Mascot Parser toolkit example code                                         #
##############################################################################
# COPYRIGHT NOTICE                                                           #
# Copyright 1998-2012 Matrix Science Limited  All Rights Reserved.           #
#                                                                            #
##############################################################################
#     $Source: parser/examples/test_perl/tools_stats.pl $
#     $Author: villek@matrixscience.com $
#       $Date: 2018-07-30 16:23:53 +0100 $
#   $Revision: 1b450440f9c97e1e41d0fc6016a27d68951d4532 | MSPARSER_REL_2_8_1-0-gea32989045 $ 
##############################################################################
use strict;
use warnings;
##############################################################################

use msparser;

print "Binomial coefficients for sample size 6:\n";

for my $i (0 .. 6) {
    printf "\t6 choose %d = %g\n", $i, msparser::ms_quant_stats::binomialCoefficient(6, $i);
}

print "First 10 values of the Poisson(lambda = 1) distribution:\n";

for my $i (1 .. 10) {
    printf "\tPoisson(lambda 1, k %d) = %g\n", $i, msparser::ms_quant_stats::poissonDensity(1, $i);
}

print "Common values of the standard normal distribution:\n";

for (0.001, 0.01, 0.025, 0.05, 0.1, 0.5, 0.9, 0.95, 0.975, 0.99, 0.999) {
    # Note that normalCriticalValue() takes an argument between 0.5 and 1.0;
    # that is, it only returns the non-negative (right-half) critical values,
    # so left-tail critical values are obtained below by symmetry.
    my $critval;
    
    if ($_ < 0.5) {
        $critval = -msparser::ms_quant_stats::normalCriticalValue(1 - $_);
    } else {
        $critval = msparser::ms_quant_stats::normalCriticalValue($_);
    }

    printf "\tNormal-1(%g) = %g\n", $_, $critval;
}

for (-3.09, -2.32, -1.96, -1.64, -1.28, 0, 1.28, 1.64, 1.96, 2.32, 3.09) {
    # Note that normalCumulativeProbability() takes an argument between 0 and
    # 4.09; it returns the probability mass between the mean and x (the right
    # half of the distribution), so the left half is obtained by symmetry.
    # (A quick sanity check combining the two functions follows after this loop.)
    my $p;

    if ($_ < 0) {
        $p = 0.5 - msparser::ms_quant_stats::normalCumulativeProbability(-$_);
    } else {
        $p = 0.5 + msparser::ms_quant_stats::normalCumulativeProbability($_);
    }

    printf "\tNormal(x <= %g) = %g\n", $_, $p;
}
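
# Combining the two conventions above: for 0 <= x <= 4.09 the two calls
# roughly invert each other. A quick sanity check (illustrative only, not
# part of the original example):
#
#   # should print a value close to 1.96
#   printf "%g\n", msparser::ms_quant_stats::normalCriticalValue(
#       0.5 + msparser::ms_quant_stats::normalCumulativeProbability(1.96));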

print "Common values of the chi-square distribution at 10 degrees of freedom:\n";

# Note that the chi-square critical value functions support only certain
# significance levels; the values used below are among them.
for (0.001, 0.01, 0.025, 0.05, 0.1) {
    my $lowcrit = msparser::ms_quant_stats::chisqLowerCriticalValue(10, $_);
    my $upcrit = msparser::ms_quant_stats::chisqUpperCriticalValue(10, $_);

    printf "\tchi-sq(df 10, sigLevel %g) = <%g, %g>\n", $_, $lowcrit, $upcrit;
}

print "Common values of the Student's t distribution at 10 degrees of freedom:\n";

# Likewise, the Student's t function supports only certain significance levels.
for (0.001, 0.005, 0.01, 0.025, 0.05, 0.1) {
    printf "\tt(df 10, sigLevel %g) = %g\n", 
        $_,
        msparser::ms_quant_stats::studentsCriticalValue(10, $_);
}

print "Common values of the F distribution at 10 and 1 degrees of freedom:\n";

# Likewise, the F function supports only certain significance levels.
for (0.01, 0.05, 0.1) {
    printf "\tF(df1 10, df2 1, sigLevel %g) = %g\n", 
        $_,
        msparser::ms_quant_stats::snedecorsCriticalValue(10, 1, $_);
}

my @samples = (
    # Uniform data generated with 
    #   perl -e 'print join "\n", map {+ rand() } 0 .. 9'
    { name => 'uniform(0,1)', popmean => 0.5, data => [split /\s+/, <<EOD] },
0.400313346021907
0.635154745166318
0.795328049493161
0.465079196585926
0.709667388012424
0.76161797692474
0.617230566355229
0.282583560369936
0.251706165020014
0.2962014901575
EOD
    # Normal(5, 2.5) data generated in R with
    # > rnorm(10, 5, 2.5)
    { name => 'normal(5,2.5)', popmean => 5, data => [split /\s+/, <<EOD] },
3.1476833 
9.2061900 
2.6079399 
4.4361054 
6.4133568 
4.3745899 
3.5588195
3.0542840 
0.6643241 
4.3697818
EOD
    # Beta(0.5, 0.5) data generated in R with
    # > rbeta(10, 0.5, 0.5)
    { name => 'beta(0.5,0.5)', popmean => 0.5, data => [split /\s+/, <<EOD] },
0.754152463 
0.008992998 
0.221637853 
0.426146895 
0.656421042 
0.224818243
0.022050868 
0.539143737 
0.877643550 
0.128030530
EOD
);

print "\n";

for my $sample (@samples) {
    printf "Sample '%s' = [%s]\n", $sample->{'name'}, join(', ', @{ $$sample{'data'} });

    # ms_quant_stats takes as input STL vectors of type double. We can easily
    # convert a Perl array into one:
    my $vec = msparser::vectord->new(scalar(@{ $$sample{'data'} }));
    
    for (0 .. $#{ $$sample{'data'} }) {
        $vec->set($_, $$sample{'data'}[$_]);
    }
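
    # A reusable helper for this conversion could look like the following
    # sketch (illustrative only; to_vectord() is not part of the Mascot
    # Parser API):
    #
    #   sub to_vectord {
    #       my $vec = msparser::vectord->new(scalar @_);
    #       $vec->set($_, $_[$_]) for 0 .. $#_;
    #       return $vec;
    #   }
    #
    # after which this block becomes: my $vec = to_vectord(@{ $$sample{'data'} });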

    # The sorted sample vector is needed for the Shapiro-Wilk W test.
    # It also makes finding the sample median fast.
    my $vec_sorted;
    
    do {
        my @sorted = sort { $a <=> $b } @{ $$sample{'data'} };
        $vec_sorted = msparser::vectord->new(scalar(@sorted));
        
        for (0 .. $#{ $$sample{'data'} }) {
            $vec_sorted->set($_, $sorted[$_]);
        }
    };

    # Auxiliary helper functions:

    printf "\tSum = %g\n", msparser::ms_quant_stats::sum($vec);
    printf "\tSum of squares = %g\n", msparser::ms_quant_stats::sumsq($vec);
    printf "\tLog transformed = [%s]\n", join(', ', @{ msparser::ms_quant_stats::logTransform($vec) });
    printf "\tExp transformed = [%s]\n", join(', ', @{ msparser::ms_quant_stats::expTransform($vec) });

    # Basic sample statistics:

    my $xbar = msparser::ms_quant_stats::arithmeticMean($vec);
    my $stdev = msparser::ms_quant_stats::arithmeticStandardDeviation($vec, $xbar);
    my $median = msparser::ms_quant_stats::sortedMedian($vec_sorted);
    # If the vector isn't sorted:
    # my $median = msparser::ms_quant_stats::unsortedMedian($vec);

    my $geombar = msparser::ms_quant_stats::geometricMean($vec);
    my $geomdev = msparser::ms_quant_stats::geometricStandardDeviation($vec, $geombar);

    printf "\tArithmetic stdev = %g\n", $stdev;
    printf "\tArithmetic mean = %g\n", $xbar;
    printf "\tStandard error of mean = %g\n", msparser::ms_quant_stats::arithmeticStandardErrorOfMean($stdev, $vec->size());
    printf "\tMedian = %g\n", $median;
    printf "\tStandard error of median = %g\n", msparser::ms_quant_stats::arithmeticStandardErrorOfMedian($stdev, $vec->size());
    printf "\tGeometric mean = %g\n", $geombar;
    printf "\tGeometric stdev = %g\n", $geomdev;

    # Basic distribution tests for normally distributed data:

    printf "\tHypothesis: distribution is N(%g, sigma^2) (mean test). p-value = %g\n", 
        $$sample{'popmean'}, 
        msparser::ms_quant_stats::calculateNormalMeanPvalue($$sample{'popmean'}, $xbar, $stdev, $vec->size());

    printf "\tHypothesis: distribution is N(%g, sigma^2) (median test). p-value = %g\n", 
        $$sample{'popmean'}, 
        msparser::ms_quant_stats::calculateNormalMedianPvalue($$sample{'popmean'}, $median, $stdev, $vec->size());

    # (We would expect the normal sample to have the highest p-value, i.e. the
    # least evidence against the hypothesis, while the uniform and beta samples
    # should have fairly low p-values. However, the sample size is so small
    # that the tests are not very reliable.)

    do {
        my ($ok, $W, $p) = msparser::ms_quant_stats::testShapiroWilkW($vec_sorted);
        printf "\tShapiro-Wilk ok = %d, W = %s, p-value = %s\n", $ok, (defined $W ? $W : '(none)'), (defined $p ? $p : '(none)');
    };

    # Outlier detection can be done even when the above tests indicate that
    # the sample is not normally distributed. However, don't trust the results!

    do {
        # Note that the input vector must be sorted for outlier testing to
        # work.
        my ($numL, $numH) = msparser::ms_quant_stats::detectOutliers($vec_sorted, "auto", 0.05);

        # $numL and $numH are the numbers of elements at the low and high ends
        # of $vec_sorted that appear to be outliers: that is, the elements at
        # indices 0 to $numL-1 and at indices $vec_sorted->size()-$numH to
        # $vec_sorted->size()-1.
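        #
        # When $numL and $numH are defined, the flagged values themselves can
        # be pulled back out of the sorted vector along these lines (a sketch
        # only; it assumes the SWIG-wrapped vectord exposes a get() accessor
        # alongside the set() used earlier):
        #
        #   my @low  = map { $vec_sorted->get($_) } 0 .. $numL - 1;
        #   my @high = map { $vec_sorted->get($_) }
        #              $vec_sorted->size() - $numH .. $vec_sorted->size() - 1;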

        printf "\tOutlier detection('auto', 0.05') = <%s, %s>\n", (defined $numL ? $numL : '(none)'), (defined $numH ? $numH : '(none)');
    };

    print "\n";
}

=pod

Running the program as

    perl -I../bin tools_stats.pl

will give the following output under Mascot 2.5:

Binomial coefficients for sample size 6:
    6 choose 0 = 1
    6 choose 1 = 6
    6 choose 2 = 15
    6 choose 3 = 20
    6 choose 4 = 15
    6 choose 5 = 6
    6 choose 6 = 1
First 10 values of the Poisson(lambda = 1) distribution:
    Poisson(lambda 1, k 1) = 0.367879
    Poisson(lambda 1, k 2) = 0.18394
    Poisson(lambda 1, k 3) = 0.0613132
    Poisson(lambda 1, k 4) = 0.0153283
    Poisson(lambda 1, k 5) = 0.00306566
    Poisson(lambda 1, k 6) = 0.000510944
    Poisson(lambda 1, k 7) = 7.2992e-05
    Poisson(lambda 1, k 8) = 9.12399e-06
    Poisson(lambda 1, k 9) = 1.01378e-06
    Poisson(lambda 1, k 10) = 1.01378e-07
Common values of the standard normal distribution:
    Normal-1(0.001) = -3.09
    Normal-1(0.01) = -2.318
    Normal-1(0.025) = -1.959
    Normal-1(0.05) = -1.642
    Normal-1(0.1) = -1.282
    Normal-1(0.5) = 0
    Normal-1(0.9) = 1.282
    Normal-1(0.95) = 1.642
    Normal-1(0.975) = 1.959
    Normal-1(0.99) = 2.318
    Normal-1(0.999) = 3.09
    Normal(x <= -3.09) = 0.001
    Normal(x <= -2.32) = 0.00554
    Normal(x <= -1.96) = 0.025
    Normal(x <= -1.64) = 0.0505
    Normal(x <= -1.28) = 0.10027
    Normal(x <= 0) = 0.5
    Normal(x <= 1.28) = 0.89973
    Normal(x <= 1.64) = 0.9495
    Normal(x <= 1.96) = 0.975
    Normal(x <= 2.32) = 0.99446
    Normal(x <= 3.09) = 0.999
Common values of the chi-square distribution at 10 degrees of freedom:
    chi-sq(df 10, sigLevel 0.001) = <1.479, 29.588>
    chi-sq(df 10, sigLevel 0.01) = <2.558, 23.209>
    chi-sq(df 10, sigLevel 0.025) = <3.247, 20.483>
    chi-sq(df 10, sigLevel 0.05) = <3.94, 18.307>
    chi-sq(df 10, sigLevel 0.1) = <4.865, 15.987>
Common values of the Student's t distribution at 10 degrees of freedom:
    t(df 10, sigLevel 0.001) = 4.143
    t(df 10, sigLevel 0.005) = 3.169
    t(df 10, sigLevel 0.01) = 2.764
    t(df 10, sigLevel 0.025) = 2.228
    t(df 10, sigLevel 0.05) = 1.812
    t(df 10, sigLevel 0.1) = 1.372
Common values of the F distribution at 10 and 1 degrees of freedom:
    F(df1 10, df2 1, sigLevel 0.01) = 6055.85
    F(df1 10, df2 1, sigLevel 0.05) = 241.882
    F(df1 10, df2 1, sigLevel 0.1) = 60.195

Sample 'uniform(0,1)' = [0.400313346021907, 0.635154745166318, 0.795328049493161, 0.465079196585926, 0.709667388012424, 0.76161797692474, 0.617230566355229, 0.282583560369936, 0.251706165020014, 0.2962014901575]
    Sum = 5.21488
    Sum of squares = 3.10813
    Log transformed = [-0.915507673489646, -0.4538866166025, -0.229000608568736, -0.765547572658224, -0.342958886300134, -0.272310191628172, -0.482512635488649, -1.26378098321243, -1.37949288361674, -1.21671534624448]
    Exp transformed = [1.49229222822126, 1.88731417160841, 2.2151675622189, 1.59214027596837, 2.03331484127, 2.14173870243289, 1.85378698617621, 1.32655261769313, 1.28621804590024, 1.34474108162913]
    Arithmetic stdev = 0.2078
    Arithmetic mean = 0.521488
    Standard error of mean = 0.065712
    Median = 0.541155
    Standard error of median = 0.0773299
    Geometric mean = 0.480864
    Geometric stdev = 1.54969
    Hypothesis: distribution is N(0.5, sigma^2) (mean test). p-value = 0.75114
    Hypothesis: distribution is N(0.5, sigma^2) (median test). p-value = 0.607474
    Shapiro-Wilk ok = 1, W = 0.904955259787869, p-value = 0.248103833545881
    Outlier detection('auto', 0.05) = <0, 0>

Sample 'normal(5,2.5)' = [3.1476833, 9.2061900, 2.6079399, 4.4361054, 6.4133568, 4.3745899, 3.5588195, 3.0542840, 0.6643241, 4.3697818]
    Sum = 41.8331
    Sum of squares = 222.941
    Log transformed = [1.14666672193647, 2.21987608389596, 0.958560599320351, 1.48977682935013, 1.85838281560642, 1.47581277827312, 1.26942888874853, 1.11654519526921, -0.408985146179298, 1.47471307651077]
    Exp transformed = [23.2820644952793, 9958.58228914508, 13.5710642846326, 84.4454192814535, 609.93768731081, 79.4072679533238, 35.1217114826069, 21.2059965963608, 1.94317668425472, 79.0263862606734]
    Arithmetic stdev = 2.30796
    Arithmetic mean = 4.18331
    Standard error of mean = 0.72984
    Median = 3.9643
    Standard error of median = 0.858876
    Geometric mean = 3.5257
    Geometric stdev = 2.00172
    Hypothesis: distribution is N(5, sigma^2) (mean test). p-value = 0.292114
    Hypothesis: distribution is N(5, sigma^2) (median test). p-value = 0.258609
    Shapiro-Wilk ok = 1, W = 0.917418368639979, p-value = 0.335925333025062
    Outlier detection('auto', 0.05) = <0, 0>

Sample 'beta(0.5,0.5)' = [0.754152463, 0.008992998, 0.221637853, 0.426146895, 0.656421042, 0.224818243, 0.022050868, 0.539143737, 0.877643550, 0.128030530]
    Sum = 3.85904
    Sum of squares = 2.3588
    Log transformed = [-0.28216072584468, -4.71130900444498, -1.50671052190953, -0.852971168207217, -0.420952863607245, -1.49246301212003, -3.8144033127847, -0.617773070154802, -0.130514747277869, -2.05548652787785]
    Exp transformed = [2.12580905780365, 1.00903355649617, 1.24811929335097, 1.53134570563969, 1.92788017044885, 1.2520951184563, 1.02229578528984, 1.71453813789789, 2.40522523023951, 1.13658770218992]
    Arithmetic stdev = 0.310837
    Arithmetic mean = 0.385904
    Standard error of mean = 0.0982953
    Median = 0.325483
    Standard error of median = 0.115674
    Geometric mean = 0.204237
    Geometric stdev = 4.71093
    Hypothesis: distribution is N(0.5, sigma^2) (mean test). p-value = 0.275606
    Hypothesis: distribution is N(0.5, sigma^2) (median test). p-value = 0.165651
    Shapiro-Wilk ok = 1, W = 0.930482521074392, p-value = 0.452655082151213
    Outlier detection('auto', 0.05) = <0, 0>

=cut
