changeset 11650:a8e0f7184a59 release-3-0-x

handle ties in kruskal_wallis_test
author Timo Lindfors
date Fri, 15 Feb 2008 16:50:16 -0500
parents 196a759fc7e8
children 74de76325d12
files doc/interpreter/contributors.in scripts/ChangeLog scripts/general/runlength.m scripts/statistics/tests/kruskal_wallis_test.m
diffstat 4 files changed, 64 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/doc/interpreter/contributors.in
+++ b/doc/interpreter/contributors.in
@@ -104,6 +104,7 @@
 Dirk Laurie
 Maurice LeBrun
 Friedrich Leisch
+Timo Lindfors
 Benjamin Lindner
 Ross Lippert
 David Livings
--- a/scripts/ChangeLog
+++ b/scripts/ChangeLog
@@ -1,3 +1,8 @@
+2008-02-15  Timo Lindfors  <timo.lindfors@iki.fi>
+
+	* statistics/tests/kruskal_wallis_test.m: Handle ties.
+	* general/runlength.m: New function from Paul Kienzle.
+
 2008-02-15  Rolf Fabian  <r.fabian@jacobs-university.de>
 
 	* linear-algebra/cond.m: New optional second argument to
new file mode 100644
--- /dev/null
+++ b/scripts/general/runlength.m
@@ -0,0 +1,36 @@
+## Copyright (C) 2005, 2008 Paul Kienzle
+##
+## This file is part of Octave.
+##
+## Octave is free software; you can redistribute it and/or modify it
+## under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 3 of the License, or (at
+## your option) any later version.
+##
+## Octave is distributed in the hope that it will be useful, but
+## WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+## General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with Octave; see the file COPYING.  If not, see
+## <http://www.gnu.org/licenses/>.
+
+## -*- texinfo -*-
+## @deftypefn {Function File} {} runlength (@var{x})
+## Find the lengths of all sequences of common values.  Return the
+## vector of lengths and the value that was repeated.
+##
+## @example
+## runlength ([2, 2, 0, 4, 4, 4, 0, 1, 1, 1, 1])
+## @result{}  [2, 1, 3, 1, 4]
+## @end example
+## @end deftypefn
+
+function [count, value] = runlength (x)
+  idx = [find(x(1:end-1) != x(2:end)), length(x)];
+  value = x(idx);
+  count = diff ([0 idx]);
+endfunction
+
+%!assert (runlength([2 2 0 4 4 4 0 1 1 1 1]), [2 1 3 1 4]);
--- a/scripts/statistics/tests/kruskal_wallis_test.m
+++ b/scripts/statistics/tests/kruskal_wallis_test.m
@@ -29,6 +29,18 @@
 ## approximately chi-square with @var{df} = @var{k} - 1 degrees of
 ## freedom.
 ##
+## If the data contains ties (some value appears more than once)
+## @var{k} is divided by
+## 
+## 1 - @var{sumTies} / ( @var{n}^3 - @var{n} )
+##
+## where @var{sumTies} is the sum of @var{t}^2 - @var{t} over each group
+## of ties where @var{t} is the number of ties in the group and @var{n}
+## is the total number of values in the input data. For more info on
+## this adjustment see "Use of Ranks in One-Criterion Variance Analysis"
+## in Journal of the American Statistical Association, Vol. 47,
+## No. 260 (Dec 1952) by William H. Kruskal and W. Allen Wallis.
+##
 ## The p-value (1 minus the CDF of this distribution at @var{k}) is
 ## returned in @var{pval}.
 ##
@@ -67,9 +79,14 @@
     j = j + n(i);
   endfor
 
-  n    = length (p);
-  k    = 12 * k / (n * (n + 1)) - 3 * (n + 1);
-  df   = m - 1;
+  n = length (p);
+  k = 12 * k / (n * (n + 1)) - 3 * (n + 1);
+
+  ## Adjust the result to takes ties into account.
+  sum_ties = sum (polyval ([1, 0, -1, 0], runlength (sort (p))));
+  k = k / (1 - sum_ties / (n^3 - n));
+
+  df = m - 1;
   pval = 1 - chisquare_cdf (k, df);
 
   if (nargout == 0)
@@ -78,4 +95,5 @@
 
 endfunction
 
-
+## Test with ties
+%!assert (abs(kruskal_wallis_test([86 86], [74]) - 0.157299207050285) < 0.0000000000001)