# tools_stats.pl

Using statistical routines.

```#!/usr/local/bin/perl
##############################################################################
# file: tools_stats.pl                                                       #
# Mascot Parser toolkit example code                                         #
##############################################################################
#                                                                            #
##############################################################################
#     \$Source: /vol/cvsroot/parser/examples/test_perl/tools_stats.pl,v \$
#     \$Author: villek \$
#       \$Date: 2013/12/02 16:14:28 \$
#   \$Revision: 1.3 \$
##############################################################################
use strict;
##############################################################################

use msparser;

print "Binomial coefficients for sample size 6:\n";

for my \$i (0 .. 6) {
printf "\t6 choose %d = %g\n", \$i, msparser::ms_quant_stats::binomialCoefficient(6, \$i);
}

print "First 10 values of the Poisson(lambda = 1) distribution:\n";

for my \$i (1 .. 10) {
printf "\tPoisson(lambda 1, k %d) = %g\n", \$i, msparser::ms_quant_stats::poissonDensity(1, \$i);
}

print "Common values of the standard normal distribution:\n";

for (0.001, 0.01, 0.025, 0.05, 0.1, 0.5, 0.9, 0.95, 0.975, 0.99, 0.999) {
# Note that normalCriticalValue() takes an argument between 0.5 and 1.0;
# that is, it returns only the right tail critical values.
my \$critval;

if (\$_ < 0.5) {
\$critval = -msparser::ms_quant_stats::normalCriticalValue(1 - \$_);
} else {
\$critval = msparser::ms_quant_stats::normalCriticalValue(\$_);
}

printf "\tNormal-1(%g) = %g\n", \$_, \$critval;
}

for (-3.09, -2.32, -1.96, -1.64, -1.28, 0, 1.28, 1.64, 1.96, 2.32, 3.09) {
# Note that normalCumulativeProbability() takes an argument between 0 and
# 4.09; that is, it reports only the right tail probability.
my \$p;

if (\$_ < 0) {
\$p = 0.5 - msparser::ms_quant_stats::normalCumulativeProbability(-\$_);
} else {
\$p = 0.5 + msparser::ms_quant_stats::normalCumulativeProbability(\$_);
}

printf "\tNormal(x <= %g) = %g\n", \$_, \$p;
}

print "Common values of the chi-square distribution at 10 degrees of freedom:\n";

# Note the significance levels the chi-square functions support.
for (0.001, 0.01, 0.025, 0.05, 0.1) {
my \$lowcrit = msparser::ms_quant_stats::chisqLowerCriticalValue(10, \$_);
my \$upcrit = msparser::ms_quant_stats::chisqUpperCriticalValue(10, \$_);

printf "\tchi-sq(df 10, sigLevel %g) = <%g, %g>\n", \$_, \$lowcrit, \$upcrit;
}

print "Common values of the Student's t distribution at 10 degrees of freedom:\n";

# Note the significance levels the Student's t function supports.
for (0.001, 0.005, 0.01, 0.025, 0.05, 0.1) {
printf "\tt(df 10, sigLevel %g) = %g\n",
\$_,
msparser::ms_quant_stats::studentsCriticalValue(10, \$_);
}

print "Common values of the F distribution at 10 and 1 degrees of freedom:\n";

# Note the significance levels the F function supports.
for (0.01, 0.05, 0.1) {
printf "\tF(df1 10, df2 1, sigLevel %g) = %g\n",
\$_,
msparser::ms_quant_stats::snedecorsCriticalValue(10, 1, \$_);
}

my @samples = (
# Uniform data generated with
#   perl -e 'print join "\n", map {+ rand() } 0 .. 9'
{ name => 'uniform(0,1)', popmean => 0.5, data => [split /\s+/, <<EOD] },
0.400313346021907
0.635154745166318
0.795328049493161
0.465079196585926
0.709667388012424
0.76161797692474
0.617230566355229
0.282583560369936
0.251706165020014
0.2962014901575
EOD
# Normal(5, 2.5) data generated in R with
# > rnorm(10, 5, 2.5)
{ name => 'normal(5,2.5)', popmean => 5, data => [split /\s+/, <<EOD] },
3.1476833
9.2061900
2.6079399
4.4361054
6.4133568
4.3745899
3.5588195
3.0542840
0.6643241
4.3697818
EOD
# Beta(0.5, 0.5) data generated in R with
# > rbeta(10, 0.5, 0.5)
{ name => 'beta(0.5,0.5)', popmean => 0.5, data => [split /\s+/, <<EOD] },
0.754152463
0.008992998
0.221637853
0.426146895
0.656421042
0.224818243
0.022050868
0.539143737
0.877643550
0.128030530
EOD
);

print "\n";

for my \$sample (@samples) {
printf "Sample '%s' = [%s]\n", \$sample->{'name'}, join(', ', @{ \$\$sample{'data'} });

# ms_quant_stats takes as input STL vectors of type double. We can easily
# convert a Perl array into one:
my \$vec = msparser::vectord->new(scalar(@{ \$\$sample{'data'} }));

for (0 .. \$#{ \$\$sample{'data'} }) {
\$vec->set(\$_, \$\$sample{'data'}[\$_]);
}

# The sorted sample vector is needed for the Shapiro-Wilk W test.
# It also makes finding the sample median fast.
my \$vec_sorted;

do {
my @sorted = sort { \$a <=> \$b } @{ \$\$sample{'data'} };
\$vec_sorted = msparser::vectord->new(scalar(@sorted));

for (0 .. \$#{ \$\$sample{'data'} }) {
\$vec_sorted->set(\$_, \$sorted[\$_]);
}
};

# Auxiliary helper functions:

printf "\tSum = %g\n", msparser::ms_quant_stats::sum(\$vec);
printf "\tSum of squares = %g\n", msparser::ms_quant_stats::sumsq(\$vec);
printf "\tLog transformed = [%s]\n", join(', ', @{ msparser::ms_quant_stats::logTransform(\$vec) });
printf "\tExp transformed = [%s]\n", join(', ', @{ msparser::ms_quant_stats::expTransform(\$vec) });

# Basic sample statistics:

my \$xbar = msparser::ms_quant_stats::arithmeticMean(\$vec);
my \$stdev = msparser::ms_quant_stats::arithmeticStandardDeviation(\$vec, \$xbar);
my \$median = msparser::ms_quant_stats::sortedMedian(\$vec_sorted);
# If the vector isn't sorted:
# my \$median = msparser::ms_quant_stats::unsortedMedian(\$vec);

my \$geombar = msparser::ms_quant_stats::geometricMean(\$vec);
my \$geomdev = msparser::ms_quant_stats::geometricStandardDeviation(\$vec, \$geombar);

printf "\tArithmetic stdev = %g\n", \$stdev;
printf "\tArithmetic mean = %g\n", \$xbar;
printf "\tStandard error of mean = %g\n", msparser::ms_quant_stats::arithmeticStandardErrorOfMean(\$stdev, \$vec->size());
printf "\tMedian = %g\n", \$median;
printf "\tStandard error of median = %g\n", msparser::ms_quant_stats::arithmeticStandardErrorOfMedian(\$stdev, \$vec->size());
printf "\tGeometric mean = %g\n", \$geombar;
printf "\tGeometric stdev = %g\n", \$geomdev;

# Basic distribution tests for normally distributed data:

printf "\tHypothesis: distribution is N(%g, sigma^2) (mean test). p-value = %g\n",
\$\$sample{'popmean'},
msparser::ms_quant_stats::calculateNormalMeanPvalue(\$\$sample{'popmean'}, \$xbar, \$stdev, \$vec->size());

printf "\tHypothesis: distribution is N(%g, sigma^2) (median test). p-value = %g\n",
\$\$sample{'popmean'},
msparser::ms_quant_stats::calculateNormalMedianPvalue(\$\$sample{'popmean'}, \$median, \$stdev, \$vec->size());

# (We would expect the normal sample to have the highest p-value, i.e.
# least evidence against the hypothesis, while the uniform and beta
# distribution should have fairly low p-values. However, sample size is
# so small that the tests are not very reliable.)

do {
my (\$ok, \$W, \$p) = msparser::ms_quant_stats::testShapiroWilkW(\$vec_sorted);
printf "\tShapiro-Wilk ok = %d, W = %s, p-value = %s\n", \$ok, (defined \$W ? \$W : '(none)'), (defined \$p ? \$p : '(none)');
};

# Outlier detection can be done even when the above tests indicate that
# the sample is not normally distributed. However, don't trust the results!

do {
# Note that the input vector must be sorted for outlier testing to
# work.
my (\$numL, \$numH) = msparser::ms_quant_stats::detectOutliers(\$vec_sorted, "auto", 0.05);

# \$numL and \$numH are the number of elements at each end of \$vec_sorted
# which seem to be outliers. That is, elements between 0 and \$numL-1,
# and \$vec_sorted->size()-\$numH to \$vec_sorted->size()-1.

printf "\tOutlier detection('auto', 0.05') = <%s, %s>\n", (defined \$numL ? \$numL : '(none)'), (defined \$numH ? \$numH : '(none)');
};

print "\n";
}

=pod

Running the program as

perl -I../bin tools_stats.pl

will give the following output under Mascot 2.5:

Binomial coefficients for sample size 6:
6 choose 0 = 1
6 choose 1 = 6
6 choose 2 = 15
6 choose 3 = 20
6 choose 4 = 15
6 choose 5 = 6
6 choose 6 = 1
First 10 values of the Poisson(lambda = 1) distribution:
Poisson(lambda 1, k 1) = 0.367879
Poisson(lambda 1, k 2) = 0.18394
Poisson(lambda 1, k 3) = 0.0613132
Poisson(lambda 1, k 4) = 0.0153283
Poisson(lambda 1, k 5) = 0.00306566
Poisson(lambda 1, k 6) = 0.000510944
Poisson(lambda 1, k 7) = 7.2992e-05
Poisson(lambda 1, k 8) = 9.12399e-06
Poisson(lambda 1, k 9) = 1.01378e-06
Poisson(lambda 1, k 10) = 1.01378e-07
Common values of the standard normal distribution:
Normal-1(0.001) = -3.09
Normal-1(0.01) = -2.318
Normal-1(0.025) = -1.959
Normal-1(0.05) = -1.642
Normal-1(0.1) = -1.282
Normal-1(0.5) = 0
Normal-1(0.9) = 1.282
Normal-1(0.95) = 1.642
Normal-1(0.975) = 1.959
Normal-1(0.99) = 2.318
Normal-1(0.999) = 3.09
Normal(x <= -3.09) = 0.001
Normal(x <= -2.32) = 0.00554
Normal(x <= -1.96) = 0.025
Normal(x <= -1.64) = 0.0505
Normal(x <= -1.28) = 0.10027
Normal(x <= 0) = 0.5
Normal(x <= 1.28) = 0.89973
Normal(x <= 1.64) = 0.9495
Normal(x <= 1.96) = 0.975
Normal(x <= 2.32) = 0.99446
Normal(x <= 3.09) = 0.999
Common values of the chi-square distribution at 10 degrees of freedom:
chi-sq(df 10, sigLevel 0.001) = <1.479, 29.588>
chi-sq(df 10, sigLevel 0.01) = <2.558, 23.209>
chi-sq(df 10, sigLevel 0.025) = <3.247, 20.483>
chi-sq(df 10, sigLevel 0.05) = <3.94, 18.307>
chi-sq(df 10, sigLevel 0.1) = <4.865, 15.987>
Common values of the Student's t distribution at 10 degrees of freedom:
t(df 10, sigLevel 0.001) = 4.143
t(df 10, sigLevel 0.005) = 3.169
t(df 10, sigLevel 0.01) = 2.764
t(df 10, sigLevel 0.025) = 2.228
t(df 10, sigLevel 0.05) = 1.812
t(df 10, sigLevel 0.1) = 1.372
Common values of the F distribution at 10 and 1 degrees of freedom:
F(df1 10, df2 1, sigLevel 0.01) = 6055.85
F(df1 10, df2 1, sigLevel 0.05) = 241.882
F(df1 10, df2 1, sigLevel 0.1) = 60.195

Sample 'uniform(0,1)' = [0.400313346021907, 0.635154745166318, 0.795328049493161, 0.465079196585926, 0.709667388012424, 0.76161797692474, 0.617230566355229, 0.282583560369936, 0.251706165020014, 0.2962014901575]
Sum = 5.21488
Sum of squares = 3.10813
Log transformed = [-0.915507673489646, -0.4538866166025, -0.229000608568736, -0.765547572658224, -0.342958886300134, -0.272310191628172, -0.482512635488649, -1.26378098321243, -1.37949288361674, -1.21671534624448]
Exp transformed = [1.49229222822126, 1.88731417160841, 2.2151675622189, 1.59214027596837, 2.03331484127, 2.14173870243289, 1.85378698617621, 1.32655261769313, 1.28621804590024, 1.34474108162913]
Arithmetic stdev = 0.2078
Arithmetic mean = 0.521488
Standard error of mean = 0.065712
Median = 0.541155
Standard error of median = 0.0773299
Geometric mean = 0.480864
Geometric stdev = 1.54969
Hypothesis: distribution is N(0.5, sigma^2) (mean test). p-value = 0.75114
Hypothesis: distribution is N(0.5, sigma^2) (median test). p-value = 0.607474
Shapiro-Wilk ok = 1, W = 0.904955259787869, p-value = 0.248103833545881
Outlier detection('auto', 0.05') = <0, 0>

Sample 'normal(5,2.5)' = [3.1476833, 9.2061900, 2.6079399, 4.4361054, 6.4133568, 4.3745899, 3.5588195, 3.0542840, 0.6643241, 4.3697818]
Sum = 41.8331
Sum of squares = 222.941
Log transformed = [1.14666672193647, 2.21987608389596, 0.958560599320351, 1.48977682935013, 1.85838281560642, 1.47581277827312, 1.26942888874853, 1.11654519526921, -0.408985146179298, 1.47471307651077]
Exp transformed = [23.2820644952793, 9958.58228914508, 13.5710642846326, 84.4454192814535, 609.93768731081, 79.4072679533238, 35.1217114826069, 21.2059965963608, 1.94317668425472, 79.0263862606734]
Arithmetic stdev = 2.30796
Arithmetic mean = 4.18331
Standard error of mean = 0.72984
Median = 3.9643
Standard error of median = 0.858876
Geometric mean = 3.5257
Geometric stdev = 2.00172
Hypothesis: distribution is N(5, sigma^2) (mean test). p-value = 0.292114
Hypothesis: distribution is N(5, sigma^2) (median test). p-value = 0.258609
Shapiro-Wilk ok = 1, W = 0.917418368639979, p-value = 0.335925333025062
Outlier detection('auto', 0.05') = <0, 0>

Sample 'beta(0.5,0.5)' = [0.754152463, 0.008992998, 0.221637853, 0.426146895, 0.656421042, 0.224818243, 0.022050868, 0.539143737, 0.877643550, 0.128030530]
Sum = 3.85904
Sum of squares = 2.3588
Log transformed = [-0.28216072584468, -4.71130900444498, -1.50671052190953, -0.852971168207217, -0.420952863607245, -1.49246301212003, -3.8144033127847, -0.617773070154802, -0.130514747277869, -2.05548652787785]
Exp transformed = [2.12580905780365, 1.00903355649617, 1.24811929335097, 1.53134570563969, 1.92788017044885, 1.2520951184563, 1.02229578528984, 1.71453813789789, 2.40522523023951, 1.13658770218992]
Arithmetic stdev = 0.310837
Arithmetic mean = 0.385904
Standard error of mean = 0.0982953
Median = 0.325483
Standard error of median = 0.115674
Geometric mean = 0.204237
Geometric stdev = 4.71093
Hypothesis: distribution is N(0.5, sigma^2) (mean test). p-value = 0.275606
Hypothesis: distribution is N(0.5, sigma^2) (median test). p-value = 0.165651
Shapiro-Wilk ok = 1, W = 0.930482521074392, p-value = 0.452655082151213
Outlier detection('auto', 0.05') = <0, 0>

=cut
```