Using statistical routines.
#!/usr/local/bin/perl ############################################################################## # file: tools_stats.pl # # Mascot Parser toolkit example code # ############################################################################## # COPYRIGHT NOTICE # # Copyright 1998-2012 Matrix Science Limited All Rights Reserved. # # # ############################################################################## # $Source: parser/examples/test_perl/tools_stats.pl $ # $Author: villek@matrixscience.com $ # $Date: 2018-07-30 16:23:53 +0100 $ # $Revision: 1b450440f9c97e1e41d0fc6016a27d68951d4532 | MSPARSER_REL_2_8_1-0-gea32989045 $ ############################################################################## use strict; ############################################################################## use msparser; print "Binomial coefficients for sample size 6:\n"; for my $i (0 .. 6) { printf "\t6 choose %d = %g\n", $i, msparser::ms_quant_stats::binomialCoefficient(6, $i); } print "First 10 values of the Poisson(lambda = 1) distribution:\n"; for my $i (1 .. 10) { printf "\tPoisson(lambda 1, k %d) = %g\n", $i, msparser::ms_quant_stats::poissonDensity(1, $i); } print "Common values of the standard normal distribution:\n"; for (0.001, 0.01, 0.025, 0.05, 0.1, 0.5, 0.9, 0.95, 0.975, 0.99, 0.999) { # Note that normalCriticalValue() takes an argument between 0.5 and 1.0; # that is, it returns only the right tail critical values. my $critval; if ($_ < 0.5) { $critval = -msparser::ms_quant_stats::normalCriticalValue(1 - $_); } else { $critval = msparser::ms_quant_stats::normalCriticalValue($_); } printf "\tNormal-1(%g) = %g\n", $_, $critval; } for (-3.09, -2.32, -1.96, -1.64, -1.28, 0, 1.28, 1.64, 1.96, 2.32, 3.09) { # Note that normalCumulativeProbability() takes an argument between 0 and # 4.09; that is, it reports only the right tail probability. my $p; if ($_ < 0) { $p = 0.5 - msparser::ms_quant_stats::normalCumulativeProbability(-$_); } else { $p = 0.5 + msparser::ms_quant_stats::normalCumulativeProbability($_); } printf "\tNormal(x <= %g) = %g\n", $_, $p; } print "Common values of the chi-square distribution at 10 degrees of freedom:\n"; # Note the significance levels the chi-square functions support. for (0.001, 0.01, 0.025, 0.05, 0.1) { my $lowcrit = msparser::ms_quant_stats::chisqLowerCriticalValue(10, $_); my $upcrit = msparser::ms_quant_stats::chisqUpperCriticalValue(10, $_); printf "\tchi-sq(df 10, sigLevel %g) = <%g, %g>\n", $_, $lowcrit, $upcrit; } print "Common values of the Student's t distribution at 10 degrees of freedom:\n"; # Note the significance levels the Student's t function supports. for (0.001, 0.005, 0.01, 0.025, 0.05, 0.1) { printf "\tt(df 10, sigLevel %g) = %g\n", $_, msparser::ms_quant_stats::studentsCriticalValue(10, $_); } print "Common values of the F distribution at 10 and 1 degrees of freedom:\n"; # Note the significance levels the F function supports. for (0.01, 0.05, 0.1) { printf "\tF(df1 10, df2 1, sigLevel %g) = %g\n", $_, msparser::ms_quant_stats::snedecorsCriticalValue(10, 1, $_); } my @samples = ( # Uniform data generated with # perl -e 'print join "\n", map {+ rand() } 0 .. 9' { name => 'uniform(0,1)', popmean => 0.5, data => [split /\s+/, <<EOD] }, 0.400313346021907 0.635154745166318 0.795328049493161 0.465079196585926 0.709667388012424 0.76161797692474 0.617230566355229 0.282583560369936 0.251706165020014 0.2962014901575 EOD # Normal(5, 2.5) data generated in R with # > rnorm(10, 5, 2.5) { name => 'normal(5,2.5)', popmean => 5, data => [split /\s+/, <<EOD] }, 3.1476833 9.2061900 2.6079399 4.4361054 6.4133568 4.3745899 3.5588195 3.0542840 0.6643241 4.3697818 EOD # Beta(0.5, 0.5) data generated in R with # > rbeta(10, 0.5, 0.5) { name => 'beta(0.5,0.5)', popmean => 0.5, data => [split /\s+/, <<EOD] }, 0.754152463 0.008992998 0.221637853 0.426146895 0.656421042 0.224818243 0.022050868 0.539143737 0.877643550 0.128030530 EOD ); print "\n"; for my $sample (@samples) { printf "Sample '%s' = [%s]\n", $sample->{'name'}, join(', ', @{ $$sample{'data'} }); # ms_quant_stats takes as input STL vectors of type double. We can easily # convert a Perl array into one: my $vec = msparser::vectord->new(scalar(@{ $$sample{'data'} })); for (0 .. $#{ $$sample{'data'} }) { $vec->set($_, $$sample{'data'}[$_]); } # The sorted sample vector is needed for the Shapiro-Wilk W test. # It also makes finding the sample median fast. my $vec_sorted; do { my @sorted = sort { $a <=> $b } @{ $$sample{'data'} }; $vec_sorted = msparser::vectord->new(scalar(@sorted)); for (0 .. $#{ $$sample{'data'} }) { $vec_sorted->set($_, $sorted[$_]); } }; # Auxiliary helper functions: printf "\tSum = %g\n", msparser::ms_quant_stats::sum($vec); printf "\tSum of squares = %g\n", msparser::ms_quant_stats::sumsq($vec); printf "\tLog transformed = [%s]\n", join(', ', @{ msparser::ms_quant_stats::logTransform($vec) }); printf "\tExp transformed = [%s]\n", join(', ', @{ msparser::ms_quant_stats::expTransform($vec) }); # Basic sample statistics: my $xbar = msparser::ms_quant_stats::arithmeticMean($vec); my $stdev = msparser::ms_quant_stats::arithmeticStandardDeviation($vec, $xbar); my $median = msparser::ms_quant_stats::sortedMedian($vec_sorted); # If the vector isn't sorted: # my $median = msparser::ms_quant_stats::unsortedMedian($vec); my $geombar = msparser::ms_quant_stats::geometricMean($vec); my $geomdev = msparser::ms_quant_stats::geometricStandardDeviation($vec, $geombar); printf "\tArithmetic stdev = %g\n", $stdev; printf "\tArithmetic mean = %g\n", $xbar; printf "\tStandard error of mean = %g\n", msparser::ms_quant_stats::arithmeticStandardErrorOfMean($stdev, $vec->size()); printf "\tMedian = %g\n", $median; printf "\tStandard error of median = %g\n", msparser::ms_quant_stats::arithmeticStandardErrorOfMedian($stdev, $vec->size()); printf "\tGeometric mean = %g\n", $geombar; printf "\tGeometric stdev = %g\n", $geomdev; # Basic distribution tests for normally distributed data: printf "\tHypothesis: distribution is N(%g, sigma^2) (mean test). p-value = %g\n", $$sample{'popmean'}, msparser::ms_quant_stats::calculateNormalMeanPvalue($$sample{'popmean'}, $xbar, $stdev, $vec->size()); printf "\tHypothesis: distribution is N(%g, sigma^2) (median test). p-value = %g\n", $$sample{'popmean'}, msparser::ms_quant_stats::calculateNormalMedianPvalue($$sample{'popmean'}, $median, $stdev, $vec->size()); # (We would expect the normal sample to have the highest p-value, i.e. # least evidence against the hypothesis, while the uniform and beta # distribution should have fairly low p-values. However, sample size is # so small that the tests are not very reliable.) do { my ($ok, $W, $p) = msparser::ms_quant_stats::testShapiroWilkW($vec_sorted); printf "\tShapiro-Wilk ok = %d, W = %s, p-value = %s\n", $ok, (defined $W ? $W : '(none)'), (defined $p ? $p : '(none)'); }; # Outlier detection can be done even when the above tests indicate that # the sample is not normally distributed. However, don't trust the results! do { # Note that the input vector must be sorted for outlier testing to # work. my ($numL, $numH) = msparser::ms_quant_stats::detectOutliers($vec_sorted, "auto", 0.05); # $numL and $numH are the number of elements at each end of $vec_sorted # which seem to be outliers. That is, elements between 0 and $numL-1, # and $vec_sorted->size()-$numH to $vec_sorted->size()-1. printf "\tOutlier detection('auto', 0.05') = <%s, %s>\n", (defined $numL ? $numL : '(none)'), (defined $numH ? $numH : '(none)'); }; print "\n"; } =pod Running the program as perl -I../bin tools_stats.pl will give the following output under Mascot 2.5: Binomial coefficients for sample size 6: 6 choose 0 = 1 6 choose 1 = 6 6 choose 2 = 15 6 choose 3 = 20 6 choose 4 = 15 6 choose 5 = 6 6 choose 6 = 1 First 10 values of the Poisson(lambda = 1) distribution: Poisson(lambda 1, k 1) = 0.367879 Poisson(lambda 1, k 2) = 0.18394 Poisson(lambda 1, k 3) = 0.0613132 Poisson(lambda 1, k 4) = 0.0153283 Poisson(lambda 1, k 5) = 0.00306566 Poisson(lambda 1, k 6) = 0.000510944 Poisson(lambda 1, k 7) = 7.2992e-05 Poisson(lambda 1, k 8) = 9.12399e-06 Poisson(lambda 1, k 9) = 1.01378e-06 Poisson(lambda 1, k 10) = 1.01378e-07 Common values of the standard normal distribution: Normal-1(0.001) = -3.09 Normal-1(0.01) = -2.318 Normal-1(0.025) = -1.959 Normal-1(0.05) = -1.642 Normal-1(0.1) = -1.282 Normal-1(0.5) = 0 Normal-1(0.9) = 1.282 Normal-1(0.95) = 1.642 Normal-1(0.975) = 1.959 Normal-1(0.99) = 2.318 Normal-1(0.999) = 3.09 Normal(x <= -3.09) = 0.001 Normal(x <= -2.32) = 0.00554 Normal(x <= -1.96) = 0.025 Normal(x <= -1.64) = 0.0505 Normal(x <= -1.28) = 0.10027 Normal(x <= 0) = 0.5 Normal(x <= 1.28) = 0.89973 Normal(x <= 1.64) = 0.9495 Normal(x <= 1.96) = 0.975 Normal(x <= 2.32) = 0.99446 Normal(x <= 3.09) = 0.999 Common values of the chi-square distribution at 10 degrees of freedom: chi-sq(df 10, sigLevel 0.001) = <1.479, 29.588> chi-sq(df 10, sigLevel 0.01) = <2.558, 23.209> chi-sq(df 10, sigLevel 0.025) = <3.247, 20.483> chi-sq(df 10, sigLevel 0.05) = <3.94, 18.307> chi-sq(df 10, sigLevel 0.1) = <4.865, 15.987> Common values of the Student's t distribution at 10 degrees of freedom: t(df 10, sigLevel 0.001) = 4.143 t(df 10, sigLevel 0.005) = 3.169 t(df 10, sigLevel 0.01) = 2.764 t(df 10, sigLevel 0.025) = 2.228 t(df 10, sigLevel 0.05) = 1.812 t(df 10, sigLevel 0.1) = 1.372 Common values of the F distribution at 10 and 1 degrees of freedom: F(df1 10, df2 1, sigLevel 0.01) = 6055.85 F(df1 10, df2 1, sigLevel 0.05) = 241.882 F(df1 10, df2 1, sigLevel 0.1) = 60.195 Sample 'uniform(0,1)' = [0.400313346021907, 0.635154745166318, 0.795328049493161, 0.465079196585926, 0.709667388012424, 0.76161797692474, 0.617230566355229, 0.282583560369936, 0.251706165020014, 0.2962014901575] Sum = 5.21488 Sum of squares = 3.10813 Log transformed = [-0.915507673489646, -0.4538866166025, -0.229000608568736, -0.765547572658224, -0.342958886300134, -0.272310191628172, -0.482512635488649, -1.26378098321243, -1.37949288361674, -1.21671534624448] Exp transformed = [1.49229222822126, 1.88731417160841, 2.2151675622189, 1.59214027596837, 2.03331484127, 2.14173870243289, 1.85378698617621, 1.32655261769313, 1.28621804590024, 1.34474108162913] Arithmetic stdev = 0.2078 Arithmetic mean = 0.521488 Standard error of mean = 0.065712 Median = 0.541155 Standard error of median = 0.0773299 Geometric mean = 0.480864 Geometric stdev = 1.54969 Hypothesis: distribution is N(0.5, sigma^2) (mean test). p-value = 0.75114 Hypothesis: distribution is N(0.5, sigma^2) (median test). p-value = 0.607474 Shapiro-Wilk ok = 1, W = 0.904955259787869, p-value = 0.248103833545881 Outlier detection('auto', 0.05') = <0, 0> Sample 'normal(5,2.5)' = [3.1476833, 9.2061900, 2.6079399, 4.4361054, 6.4133568, 4.3745899, 3.5588195, 3.0542840, 0.6643241, 4.3697818] Sum = 41.8331 Sum of squares = 222.941 Log transformed = [1.14666672193647, 2.21987608389596, 0.958560599320351, 1.48977682935013, 1.85838281560642, 1.47581277827312, 1.26942888874853, 1.11654519526921, -0.408985146179298, 1.47471307651077] Exp transformed = [23.2820644952793, 9958.58228914508, 13.5710642846326, 84.4454192814535, 609.93768731081, 79.4072679533238, 35.1217114826069, 21.2059965963608, 1.94317668425472, 79.0263862606734] Arithmetic stdev = 2.30796 Arithmetic mean = 4.18331 Standard error of mean = 0.72984 Median = 3.9643 Standard error of median = 0.858876 Geometric mean = 3.5257 Geometric stdev = 2.00172 Hypothesis: distribution is N(5, sigma^2) (mean test). p-value = 0.292114 Hypothesis: distribution is N(5, sigma^2) (median test). p-value = 0.258609 Shapiro-Wilk ok = 1, W = 0.917418368639979, p-value = 0.335925333025062 Outlier detection('auto', 0.05') = <0, 0> Sample 'beta(0.5,0.5)' = [0.754152463, 0.008992998, 0.221637853, 0.426146895, 0.656421042, 0.224818243, 0.022050868, 0.539143737, 0.877643550, 0.128030530] Sum = 3.85904 Sum of squares = 2.3588 Log transformed = [-0.28216072584468, -4.71130900444498, -1.50671052190953, -0.852971168207217, -0.420952863607245, -1.49246301212003, -3.8144033127847, -0.617773070154802, -0.130514747277869, -2.05548652787785] Exp transformed = [2.12580905780365, 1.00903355649617, 1.24811929335097, 1.53134570563969, 1.92788017044885, 1.2520951184563, 1.02229578528984, 1.71453813789789, 2.40522523023951, 1.13658770218992] Arithmetic stdev = 0.310837 Arithmetic mean = 0.385904 Standard error of mean = 0.0982953 Median = 0.325483 Standard error of median = 0.115674 Geometric mean = 0.204237 Geometric stdev = 4.71093 Hypothesis: distribution is N(0.5, sigma^2) (mean test). p-value = 0.275606 Hypothesis: distribution is N(0.5, sigma^2) (median test). p-value = 0.165651 Shapiro-Wilk ok = 1, W = 0.930482521074392, p-value = 0.452655082151213 Outlier detection('auto', 0.05') = <0, 0> =cut
Copyright © 2022 Matrix Science Ltd. All Rights Reserved. Generated on Thu Mar 31 2022 01:12:29 |