.\" Automatically generated by Pod::Man 2.25 (Pod::Simple 3.16) .\" .\" Standard preamble: .\" ======================================================================== .de Sp \" Vertical space (when we can't use .PP) .if t .sp .5v .if n .sp .. .de Vb \" Begin verbatim text .ft CW .nf .ne \\$1 .. .de Ve \" End verbatim text .ft R .fi .. .\" Set up some character translations and predefined strings. \*(-- will .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left .\" double quote, and \*(R" will give a right double quote. \*(C+ will .\" give a nicer C++. Capital omega is used to do unbreakable dashes and .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, .\" nothing in troff, for use with C<>. .tr \(*W- .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' .ie n \{\ . ds -- \(*W- . ds PI pi . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch . ds L" "" . ds R" "" . ds C` "" . ds C' "" 'br\} .el\{\ . ds -- \|\(em\| . ds PI \(*p . ds L" `` . ds R" '' 'br\} .\" .\" Escape single quotes in literal strings from groff's Unicode transform. .ie \n(.g .ds Aq \(aq .el .ds Aq ' .\" .\" If the F register is turned on, we'll generate index entries on stderr for .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index .\" entries marked with X<> in POD. Of course, you'll have to process the .\" output yourself in some meaningful fashion. .ie \nF \{\ . de IX . tm Index:\\$1\t\\n%\t"\\$2" .. . nr % 0 . rr F .\} .el \{\ . de IX .. .\} .\" .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). .\" Fear. Run. Save yourself. No user-serviceable parts. . \" fudge factors for nroff and troff .if n \{\ . ds #H 0 . ds #V .8m . ds #F .3m . ds #[ \f1 . ds #] \fP .\} .if t \{\ . ds #H ((1u-(\\\\n(.fu%2u))*.13m) . ds #V .6m . ds #F 0 . ds #[ \& . ds #] \& .\} . \" simple accents for nroff and troff .if n \{\ . ds ' \& . ds ` \& . ds ^ \& . ds , \& . ds ~ ~ . ds / .\} .if t \{\ . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' .\} . \" troff and (daisy-wheel) nroff accents .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' .ds 8 \h'\*(#H'\(*b\h'-\*(#H' .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] .ds ae a\h'-(\w'a'u*4/10)'e .ds Ae A\h'-(\w'A'u*4/10)'E . \" corrections for vroff .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' . \" for low resolution devices (crt and lpr) .if \n(.H>23 .if \n(.V>19 \ \{\ . ds : e . ds 8 ss . ds o a . ds d- d\h'-1'\(ga . ds D- D\h'-1'\(hy . ds th \o'bp' . ds Th \o'LP' . ds ae ae . ds Ae AE .\} .rm #[ #] #H #V #F C .\" ======================================================================== .\" .IX Title "Basic 3pm" .TH Basic 3pm "2012-06-04" "perl v5.14.2" "User Contributed Perl Documentation" .\" For nroff, turn off justification. 
.\" way too many mistakes in technical documents.
.if n .ad l
.nh
.SH "NAME"
PDL::Stats::Basic \-\- basic statistics and related utilities such as standard
deviation, Pearson correlation, and t\-tests.
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
The terms \s-1FUNCTIONS\s0 and \s-1METHODS\s0 are arbitrarily used to refer to
methods that are threadable and methods that are \s-1NOT\s0 threadable,
respectively.
.PP
Mean and median functions are not provided here; see \s-1SEE\s0 \s-1ALSO\s0.
.SH "SYNOPSIS"
.IX Header "SYNOPSIS"
.Vb 3
\& use PDL::LiteF;
\& use PDL::NiceSlice;
\& use PDL::Stats::Basic;
\&
\& my $stdv = $data\->stdv;
.Ve
.PP
or
.PP
.Vb 1
\& my $stdv = stdv( $data );
.Ve
.SH "FUNCTIONS"
.IX Header "FUNCTIONS"
.SS "stdv"
.IX Subsection "stdv"
.Vb 1
\& Signature: (a(n); float+ [o]b())
.Ve
.PP
Sample standard deviation.
.PP
stdv does handle bad values. It will set the bad-value flag of all output
piddles if the flag is set for any of the input piddles.
.SS "stdv_unbiased"
.IX Subsection "stdv_unbiased"
.Vb 1
\& Signature: (a(n); float+ [o]b())
.Ve
.PP
Unbiased estimate of the population standard deviation.
.PP
stdv_unbiased does handle bad values. It will set the bad-value flag of all
output piddles if the flag is set for any of the input piddles.
.SS "var"
.IX Subsection "var"
.Vb 1
\& Signature: (a(n); float+ [o]b())
.Ve
.PP
Sample variance.
.PP
var does handle bad values. It will set the bad-value flag of all output
piddles if the flag is set for any of the input piddles.
.SS "var_unbiased"
.IX Subsection "var_unbiased"
.Vb 1
\& Signature: (a(n); float+ [o]b())
.Ve
.PP
Unbiased estimate of the population variance.
.PP
var_unbiased does handle bad values. It will set the bad-value flag of all
output piddles if the flag is set for any of the input piddles.
.SS "se"
.IX Subsection "se"
.Vb 1
\& Signature: (a(n); float+ [o]b())
.Ve
.PP
Standard error of the mean. Useful for calculating confidence intervals.
.PP
.Vb 1
\& # 95% confidence interval for samples with large N
\&
\& $ci_95_upper = $data\->average + 1.96 * $data\->se;
\& $ci_95_lower = $data\->average \- 1.96 * $data\->se;
.Ve
.PP
se does handle bad values. It will set the bad-value flag of all output
piddles if the flag is set for any of the input piddles.
.SS "ss"
.IX Subsection "ss"
.Vb 1
\& Signature: (a(n); float+ [o]b())
.Ve
.PP
Sum of squared deviations from the mean.
.PP
ss does handle bad values. It will set the bad-value flag of all output
piddles if the flag is set for any of the input piddles.
.SS "skew"
.IX Subsection "skew"
.Vb 1
\& Signature: (a(n); float+ [o]b())
.Ve
.PP
Sample skewness, a measure of asymmetry in the data. Skewness == 0 for a
normal distribution.
.PP
skew does handle bad values. It will set the bad-value flag of all output
piddles if the flag is set for any of the input piddles.
.SS "skew_unbiased"
.IX Subsection "skew_unbiased"
.Vb 1
\& Signature: (a(n); float+ [o]b())
.Ve
.PP
Unbiased estimate of population skewness. This is the number reported by
GNumeric's Descriptive Statistics.
.PP
skew_unbiased does handle bad values. It will set the bad-value flag of all
output piddles if the flag is set for any of the input piddles.
.SS "kurt"
.IX Subsection "kurt"
.Vb 1
\& Signature: (a(n); float+ [o]b())
.Ve
.PP
Sample kurtosis, a measure of the \*(L"peakedness\*(R" of the data.
Kurtosis == 0 for a normal distribution.
.PP
kurt does handle bad values. It will set the bad-value flag of all output
piddles if the flag is set for any of the input piddles.
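.PP
For example, a minimal sketch (the data values here are arbitrary) applying
several of the functions above to a 1\-D piddle. Because these functions are
threadable, the same calls on an n x m piddle return m values at once:
.PP
.Vb 10
\& use PDL::LiteF;
\& use PDL::Stats::Basic;
\&
\& my $data = pdl( 2, 4, 4, 4, 5, 5, 7, 9 );
\&
\& print $data\->stdv,          "\en";   # sample standard deviation
\& print $data\->stdv_unbiased, "\en";   # unbiased estimate
\& print $data\->se,            "\en";   # standard error of the mean
\& print $data\->skew,          "\en";   # sample skewness
\& print $data\->kurt,          "\en";   # sample kurtosis
.Ve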
.SS "kurt_unbiased" .IX Subsection "kurt_unbiased" .Vb 1 \& Signature: (a(n); float+ [o]b()) .Ve .PP Unbiased estimate of population kurtosis. This is the number in GNumeric Descriptive Statistics. .PP kurt_unbiased does handle bad values. It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. .SS "cov" .IX Subsection "cov" .Vb 1 \& Signature: (a(n); b(n); float+ [o]c()) .Ve .PP Sample covariance. see \fBcorr\fR for ways to call .PP cov does handle bad values. It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. .SS "cov_table" .IX Subsection "cov_table" .Vb 1 \& Signature: (a(n,m); float+ [o]c(m,m)) .Ve .PP Square covariance table. Gives the same result as threading using \fBcov\fR but it calculates only half the square, hence much faster. And it is easier to use with higher dimension pdls. .PP Usage: .PP .Vb 1 \& # 5 obs x 3 var, 2 such data tables \& \& perldl> $a = random 5, 3, 2 \& \& perldl> p $cov = $a\->cov_table \& [ \& [ \& [ 8.9636438 \-1.8624472 \-1.2416588] \& [\-1.8624472 14.341514 \-1.4245366] \& [\-1.2416588 \-1.4245366 9.8690655] \& ] \& [ \& [ 10.32644 \-0.31311789 \-0.95643674] \& [\-0.31311789 15.051779 \-7.2759577] \& [\-0.95643674 \-7.2759577 5.4465141] \& ] \& ] \& # diagonal elements of the cov table are the variances \& perldl> p $a\->var \& [ \& [ 8.9636438 14.341514 9.8690655] \& [ 10.32644 15.051779 5.4465141] \& ] .Ve .PP for the same cov matrix table using \fBcov\fR, .PP .Vb 1 \& perldl> p $a\->dummy(2)\->cov($a\->dummy(1)) .Ve .PP cov_table does handle bad values. It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. .SS "corr" .IX Subsection "corr" .Vb 1 \& Signature: (a(n); b(n); float+ [o]c()) .Ve .PP Pearson correlation coefficient. r = cov(X,Y) / (stdv(X) * stdv(Y)). .PP Usage: .PP .Vb 3 \& perldl> $a = random 5, 3 \& perldl> $b = sequence 5,3 \& perldl> p $a\->corr($b) \& \& [0.20934208 0.30949881 0.26713007] .Ve .PP for square corr table .PP .Vb 1 \& perldl> p $a\->corr($a\->dummy(1)) \& \& [ \& [ 1 \-0.41995259 \-0.029301192] \& [ \-0.41995259 1 \-0.61927619] \& [\-0.029301192 \-0.61927619 1] \& ] .Ve .PP but it is easier and faster to use \fBcorr_table\fR. .PP corr does handle bad values. It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. .SS "corr_table" .IX Subsection "corr_table" .Vb 1 \& Signature: (a(n,m); float+ [o]c(m,m)) .Ve .PP Square Pearson correlation table. Gives the same result as threading using \fBcorr\fR but it calculates only half the square, hence much faster. And it is easier to use with higher dimension pdls. .PP Usage: .PP .Vb 1 \& # 5 obs x 3 var, 2 such data tables \& \& perldl> $a = random 5, 3, 2 \& \& perldl> p $a\->corr_table \& [ \& [ \& [ 1 \-0.69835951 \-0.18549048] \& [\-0.69835951 1 0.72481605] \& [\-0.18549048 0.72481605 1] \& ] \& [ \& [ 1 0.82722569 \-0.71779883] \& [ 0.82722569 1 \-0.63938828] \& [\-0.71779883 \-0.63938828 1] \& ] \& ] .Ve .PP for the same result using \fBcorr\fR, .PP .Vb 1 \& perldl> p $a\->dummy(2)\->corr($a\->dummy(1)) .Ve .PP This is also how to use \fBt_corr\fR and \fBn_pair\fR with such a table. .PP corr_table does handle bad values. It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. 
.SS "t_corr" .IX Subsection "t_corr" .Vb 1 \& Signature: (r(); n(); [o]t()) .Ve .PP .Vb 3 \& $corr = $data\->corr( $data\->dummy(1) ); \& $n = $data\->n_pair( $data\->dummy(1) ); \& $t_corr = $corr\->t_corr( $n ); \& \& use PDL::GSL::CDF; \& \& $p_2tail = 2 * (1 \- gsl_cdf_tdist_P( $t_corr\->abs, $n\-2 )); .Ve .PP t significance test for Pearson correlations. .PP t_corr does handle bad values. It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. .SS "n_pair" .IX Subsection "n_pair" .Vb 1 \& Signature: (a(n); b(n); int [o]c()) .Ve .PP Returns the number of good pairs between 2 lists. Useful with \fBcorr\fR (esp. when bad values are involved) .PP n_pair does handle bad values. It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. .SS "corr_dev" .IX Subsection "corr_dev" .Vb 1 \& Signature: (a(n); b(n); float+ [o]c()) .Ve .PP .Vb 1 \& $corr = $a\->dev_m\->corr_dev($b\->dev_m); .Ve .PP Calculates correlations from \fBdev_m\fR vals. Seems faster than doing \fBcorr\fR from original vals when data pdl is big .PP corr_dev does handle bad values. It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. .SS "t_test" .IX Subsection "t_test" .Vb 1 \& Signature: (a(n); b(m); float+ [o]t(); [o]d()) .Ve .PP .Vb 1 \& my ($t, $df) = t_test( $pdl1, $pdl2 ); \& \& use PDL::GSL::CDF; \& \& my $p_2tail = 2 * (1 \- gsl_cdf_tdist_P( $t\->abs, $df )); .Ve .PP Independent sample t\-test, assuming equal var. .PP t_test does handle bad values. It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. .SS "t_test_nev" .IX Subsection "t_test_nev" .Vb 1 \& Signature: (a(n); b(m); float+ [o]t(); [o]d()) .Ve .PP Independent sample t\-test, \s-1NOT\s0 assuming equal var. ie Welch two sample t test. Df follows Welch-Satterthwaite equation instead of Satterthwaite (1946, as cited by Hays, 1994, 5th ed.). It matches GNumeric, which matches R. .PP .Vb 1 \& my ($t, $df) = $pdl1\->t_test( $pdl2 ); .Ve .PP t_test_nev does handle bad values. It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. .SS "t_test_paired" .IX Subsection "t_test_paired" .Vb 1 \& Signature: (a(n); b(n); float+ [o]t(); [o]d()) .Ve .PP Paired sample t\-test. .PP t_test_paired does handle bad values. It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. .SS "binomial_test" .IX Subsection "binomial_test" .Vb 1 \& Signature: (x(); n(); p_expected(); [o]p()) .Ve .PP Binomial test. One-tailed significance test for two-outcome distribution. Given the number of success, the number of trials, and the expected probability of success, returns the probability of getting this many or more successes. .PP Usage: .PP .Vb 2 \& # assume a fair coin, ie. 0.5 probablity of getting heads \& # test whether getting 8 heads out of 10 coin flips is unusual \& \& my $p = binomial_test( 8, 10, 0.5 ); # 0.0107421875. Yes it is unusual. .Ve .SH "METHODS" .IX Header "METHODS" .SS "rtable" .IX Subsection "rtable" Reads either file or file handle*. Returns observation x variable pdl and var and obs ids if specified. Ids in perl @ ref to allow for non-numeric ids. Other non-numeric entries are treated as missing, which are filled with \f(CW$opt\fR{\s-1MISSN\s0} then set to BAD*. Can specify num of data rows to read from top but not arbitrary range. .PP *If passed handle, it will not be closed here. 
.SS "binomial_test"
.IX Subsection "binomial_test"
.Vb 1
\& Signature: (x(); n(); p_expected(); [o]p())
.Ve
.PP
Binomial test. One-tailed significance test for a two-outcome distribution.
Given the number of successes, the number of trials, and the expected
probability of success, returns the probability of getting this many or more
successes.
.PP
Usage:
.PP
.Vb 2
\& # assume a fair coin, i.e. 0.5 probability of getting heads
\& # test whether getting 8 heads out of 10 coin flips is unusual
\&
\& my $p = binomial_test( 8, 10, 0.5 );  # 0.0107421875. Yes it is unusual.
.Ve
.SH "METHODS"
.IX Header "METHODS"
.SS "rtable"
.IX Subsection "rtable"
Reads either a file name or a file handle*. Returns an observation x variable
pdl, plus the variable and observation ids if specified. Ids are returned in
perl array refs to allow for non-numeric ids. Other non-numeric entries are
treated as missing; they are filled with \f(CW$opt\fR{\s-1MISSN\s0} and then
set to BAD*. The number of data rows to read from the top can be specified,
but not an arbitrary range.
.PP
*If passed a handle, it will not be closed here.
.PP
*PDL::Bad::setvaltobad only works consistently with the default \s-1TYPE\s0
double before \s-1PDL\-2\s0.4.4_04.
.PP
Default options (case insensitive):
.PP
.Vb 8
\& V       => 1,        # verbose.  prints simple status
\& TYPE    => double,
\& C_ID    => 1,        # boolean.  file has col id.
\& R_ID    => 1,        # boolean.  file has row id.
\& R_VAR   => 0,        # boolean.  set to 1 if var in rows
\& SEP     => "\et",     # can take regex qr//
\& MISSN   => \-999,     # this value treated as missing and set to BAD
\& NROW    => \*(Aq\*(Aq,       # set to read specified num of data rows
.Ve
.PP
Usage:
.PP
Sample file diet.txt:
.PP
.Vb 5
\& uid    height  weight  diet
\& akw    72      320     1
\& bcm    68      268     1
\& clq    67      180     2
\& dwm    70      200     2
\&
\& ($data, $idv, $ido) = rtable \*(Aqdiet.txt\*(Aq;
\&
\& # By default prints out data info and @$idv index and element
\&
\& reading diet.txt for data and id... OK.
\& data table as PDL dim o x v: PDL: Double D [4,3]
\& 0       height
\& 1       weight
\& 2       diet
.Ve
.PP
Another way of using it:
.PP
.Vb 1
\& $data = rtable( \e*STDIN, {TYPE=>long} );
.Ve
.SS "group_by"
.IX Subsection "group_by"
Returns the pdl reshaped according to the specified factor variable. Most
useful when used in conjunction with other threading calculations such as
average, stdv, etc. When the factor variable contains an unequal number of
cases per level, the returned pdl is padded with bad values to fit the level
with the largest number of cases. This allows the subsequent calculation
(average, stdv, etc.) to return the correct result for each level.
.PP
Usage:
.PP
.Vb 1
\& # simple case with 1d pdl and equal number of n in each level of the factor
\&
\& pdl> p $a = sequence 10
\& [0 1 2 3 4 5 6 7 8 9]
\&
\& pdl> p $factor = $a > 4
\& [0 0 0 0 0 1 1 1 1 1]
\&
\& pdl> p $a\->group_by( $factor )\->average
\& [2 7]
\&
\& # more complex case with threading and unequal number of n across levels in the factor
\&
\& pdl> p $a = sequence 10,2
\& [
\&  [ 0  1  2  3  4  5  6  7  8  9]
\&  [10 11 12 13 14 15 16 17 18 19]
\& ]
\&
\& pdl> p $factor = qsort $a( ,0) % 3
\& [
\&  [0 0 0 0 1 1 1 2 2 2]
\& ]
\&
\& pdl> p $a\->group_by( $factor )
\& [
\&  [
\&   [ 0  1  2  3]
\&   [10 11 12 13]
\&  ]
\&  [
\&   [  4   5   6 BAD]
\&   [ 14  15  16 BAD]
\&  ]
\&  [
\&   [  7   8   9 BAD]
\&   [ 17  18  19 BAD]
\&  ]
\& ]
\& ARRAY(0xa2a4e40)
\&
\& # group_by supports perl factors, multiple factors
\& # returns factor labels in addition to pdl in array context
\&
\& pdl> p $a = sequence 12
\& [0 1 2 3 4 5 6 7 8 9 10 11]
\&
\& pdl> $odd_even = [qw( e o e o e o e o e o e o )]
\&
\& pdl> $magnitude = [qw( l l l l l l h h h h h h )]
\&
\& pdl> ($a_grouped, $label) = $a\->group_by( $odd_even, $magnitude )
\&
\& pdl> p $a_grouped
\& [
\&  [
\&   [0 2 4]
\&   [1 3 5]
\&  ]
\&  [
\&   [ 6  8 10]
\&   [ 7  9 11]
\&  ]
\& ]
\&
\& pdl> p Dumper $label
\& $VAR1 = [
\&           [
\&             \*(Aqe_l\*(Aq,
\&             \*(Aqo_l\*(Aq
\&           ],
\&           [
\&             \*(Aqe_h\*(Aq,
\&             \*(Aqo_h\*(Aq
\&           ]
\&         ];
.Ve
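.PP
Because \fBgroup_by\fR simply reshapes the data, its result feeds directly into
the threadable functions in this module. A minimal sketch, reusing \f(CW$a\fR
and \f(CW$factor\fR from the first example above:
.PP
.Vb 2
\& pdl> p $a\->group_by( $factor )\->stdv    # per\-level standard deviation
\& pdl> p $a\->group_by( $factor )\->se      # per\-level standard error
.Ve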
.SS "which_id"
.IX Subsection "which_id"
Looks up the specified var (obs) ids in \f(CW$idv\fR ($ido) (see \fBrtable\fR)
and returns their indices in \f(CW$idv\fR ($ido) as a pdl if found. The
indices are ordered by the specified subset. Useful for selecting data by var
(obs) id.
.PP
.Vb 1
\& my $ind = which_id $ido, [\*(Aqsmith\*(Aq, \*(Aqsummers\*(Aq, \*(Aqtesla\*(Aq];
\&
\& my $data_subset = $data( $ind, );
\&
\& # take advantage of perl pattern matching
\& # e.g. use data from people whose last name starts with s
\&
\& my $i = which_id $ido, [ grep { /^s/ } @$ido ];
\&
\& my $data_s = $data($i, );
.Ve
.SH "SEE ALSO"
.IX Header "SEE ALSO"
PDL::Basic (hist for frequency counts)
.PP
PDL::Ufunc (sum, avg, median, min, max, etc.)
.PP
\&\s-1PDL::GSL::CDF\s0 (various cumulative distribution functions)
.SH "REFERENCES"
.IX Header "REFERENCES"
Hays, W.L. (1994). Statistics (5th ed.). Fort Worth, \s-1TX:\s0 Harcourt Brace
College Publishers.
.SH "AUTHOR"
.IX Header "AUTHOR"
Copyright (C) 2009 Maggie J. Xiong
.PP
All rights reserved. There is no warranty. You are allowed to redistribute
this software / documentation as described in the file \s-1COPYING\s0 in the
\&\s-1PDL\s0 distribution.