/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.math3.stat.inference;
import java.math.BigDecimal;
import java.util.Arrays;
import java.util.HashSet;
import org.apache.commons.math3.distribution.EnumeratedRealDistribution;
import org.apache.commons.math3.distribution.RealDistribution;
import org.apache.commons.math3.distribution.UniformRealDistribution;
import org.apache.commons.math3.exception.InsufficientDataException;
import org.apache.commons.math3.exception.MathArithmeticException;
import org.apache.commons.math3.exception.MathInternalError;
import org.apache.commons.math3.exception.NullArgumentException;
import org.apache.commons.math3.exception.NumberIsTooLargeException;
import org.apache.commons.math3.exception.OutOfRangeException;
import org.apache.commons.math3.exception.TooManyIterationsException;
import org.apache.commons.math3.exception.util.LocalizedFormats;
import org.apache.commons.math3.fraction.BigFraction;
import org.apache.commons.math3.fraction.BigFractionField;
import org.apache.commons.math3.fraction.FractionConversionException;
import org.apache.commons.math3.linear.Array2DRowFieldMatrix;
import org.apache.commons.math3.linear.FieldMatrix;
import org.apache.commons.math3.linear.MatrixUtils;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.random.JDKRandomGenerator;
import org.apache.commons.math3.random.RandomGenerator;
import org.apache.commons.math3.random.Well19937c;
import org.apache.commons.math3.util.CombinatoricsUtils;
import org.apache.commons.math3.util.FastMath;
import org.apache.commons.math3.util.MathArrays;
import org.apache.commons.math3.util.MathUtils;
Implementation of the
Kolmogorov-Smirnov (K-S) test for equality of continuous distributions.
The K-S test uses a statistic based on the maximum deviation of the empirical distribution of
sample data points from the distribution expected under the null hypothesis. For one-sample tests
evaluating the null hypothesis that a set of sample data points follow a given distribution, the
test statistic is \(D_n=\sup_x |F_n(x)-F(x)|\), where \(F\) is the expected distribution and
\(F_n\) is the empirical distribution of the \(n\) sample data points. The distribution of
\(D_n\) is estimated using a method based on [1] with certain quick decisions for extreme values
given in [2].
Two-sample tests are also supported, evaluating the null hypothesis that the two samples x
and y
come from the same underlying distribution. In this case, the test statistic is \(D_{n,m}=\sup_t | F_n(t)-F_m(t)|\) where \(n\) is the length of x
, \(m\) is the length of y
, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of the values in x
and \(F_m\) is the empirical distribution of the y
values. The default 2-sample test method, kolmogorovSmirnovTest(double[], double[])
works as follows:
- For small samples (where the product of the sample sizes is less than 10000), the method presented in [4] is used to compute the exact p-value for the 2-sample test.
- When the product of the sample sizes exceeds 10000, the asymptotic distribution of \(D_{n,m}\) is used. See
approximateP(double, int, int)
for details on the approximation.
If the product of the sample sizes is less than 10000 and the sample data contains ties, random jitter is added to the sample data to break ties before applying the algorithm above. Alternatively, the bootstrap(double[], double[], int, boolean)
method, modeled after ks.boot
in the R Matching package [3], can be used if ties are known to be present in the data.
In the two-sample case, \(D_{n,m}\) has a discrete distribution. This makes the p-value associated with the null hypothesis \(H_0 : D_{n,m} \ge d \) differ from \(H_0 : D_{n,m} > d \) by the mass of the observed value \(d\). To distinguish these, the two-sample tests use a boolean strict
parameter. This parameter is ignored for large samples.
The methods used by the 2-sample default implementation are also exposed directly:
exactP(double, int, int, boolean)
computes exact 2-sample p-values
approximateP(double, int, int)
uses the asymptotic distribution The boolean
arguments in the first two methods allow the probability used to estimate the p-value to be expressed using strict or non-strict inequality. See kolmogorovSmirnovTest(double[], double[], boolean)
.
References:
- [1] Evaluating Kolmogorov's Distribution by
George Marsaglia, Wai Wan Tsang, and Jingbo Wang
- [2] Computing the Two-Sided Kolmogorov-Smirnov
Distribution by Richard Simard and Pierre L'Ecuyer
- [3] Jasjeet S. Sekhon. 2011.
Multivariate and Propensity Score Matching Software with Automated Balance Optimization:
The Matching package for R Journal of Statistical Software, 42(7): 1-52.
- [4] Wilcox, Rand. 2012. Introduction to Robust Estimation and Hypothesis Testing,
Chapter 5, 3rd Ed. Academic Press.
Note that [1] contains an error in computing h, refer to MATH-437 for details.
Since: 3.3
/**
* Implementation of the <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
* Kolmogorov-Smirnov (K-S) test</a> for equality of continuous distributions.
* <p>
* The K-S test uses a statistic based on the maximum deviation of the empirical distribution of
* sample data points from the distribution expected under the null hypothesis. For one-sample tests
* evaluating the null hypothesis that a set of sample data points follow a given distribution, the
* test statistic is \(D_n=\sup_x |F_n(x)-F(x)|\), where \(F\) is the expected distribution and
* \(F_n\) is the empirical distribution of the \(n\) sample data points. The distribution of
* \(D_n\) is estimated using a method based on [1] with certain quick decisions for extreme values
* given in [2].
* </p>
* <p>
* Two-sample tests are also supported, evaluating the null hypothesis that the two samples
* {@code x} and {@code y} come from the same underlying distribution. In this case, the test
* statistic is \(D_{n,m}=\sup_t | F_n(t)-F_m(t)|\) where \(n\) is the length of {@code x}, \(m\) is
* the length of {@code y}, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of
* the values in {@code x} and \(F_m\) is the empirical distribution of the {@code y} values. The
* default 2-sample test method, {@link #kolmogorovSmirnovTest(double[], double[])} works as
* follows:
* <ul>
* <li>For small samples (where the product of the sample sizes is less than
* {@value #LARGE_SAMPLE_PRODUCT}), the method presented in [4] is used to compute the
* exact p-value for the 2-sample test.</li>
* <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the asymptotic
* distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} for details on
* the approximation.</li>
* </ul></p><p>
* If the product of the sample sizes is less than {@value #LARGE_SAMPLE_PRODUCT} and the sample
* data contains ties, random jitter is added to the sample data to break ties before applying
* the algorithm above. Alternatively, the {@link #bootstrap(double[], double[], int, boolean)}
* method, modeled after <a href="http://sekhon.berkeley.edu/matching/ks.boot.html">ks.boot</a>
* in the R Matching package [3], can be used if ties are known to be present in the data.
* </p>
* <p>
* In the two-sample case, \(D_{n,m}\) has a discrete distribution. This makes the p-value
* associated with the null hypothesis \(H_0 : D_{n,m} \ge d \) differ from \(H_0 : D_{n,m} > d \)
* by the mass of the observed value \(d\). To distinguish these, the two-sample tests use a boolean
* {@code strict} parameter. This parameter is ignored for large samples.
* </p>
* <p>
* The methods used by the 2-sample default implementation are also exposed directly:
* <ul>
* <li>{@link #exactP(double, int, int, boolean)} computes exact 2-sample p-values</li>
* <li>{@link #approximateP(double, int, int)} uses the asymptotic distribution The {@code boolean}
* arguments in the first two methods allow the probability used to estimate the p-value to be
* expressed using strict or non-strict inequality. See
* {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.</li>
* </ul>
* </p>
* <p>
* References:
* <ul>
* <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> Evaluating Kolmogorov's Distribution</a> by
* George Marsaglia, Wai Wan Tsang, and Jingbo Wang</li>
* <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> Computing the Two-Sided Kolmogorov-Smirnov
* Distribution</a> by Richard Simard and Pierre L'Ecuyer</li>
* <li>[3] Jasjeet S. Sekhon. 2011. <a href="http://www.jstatsoft.org/article/view/v042i07">
* Multivariate and Propensity Score Matching Software with Automated Balance Optimization:
* The Matching package for R</a> Journal of Statistical Software, 42(7): 1-52.</li>
* <li>[4] Wilcox, Rand. 2012. Introduction to Robust Estimation and Hypothesis Testing,
* Chapter 5, 3rd Ed. Academic Press.</li>
* </ul>
* <br/>
* Note that [1] contains an error in computing h, refer to <a
* href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
* </p>
*
* @since 3.3
*/
public class KolmogorovSmirnovTest {
Bound on the number of partial sums in ksSum(double, double, int)
/**
* Bound on the number of partial sums in {@link #ksSum(double, double, int)}
*/
protected static final int MAXIMUM_PARTIAL_SUM_COUNT = 100000;
Convergence criterion for ksSum(double, double, int)
/** Convergence criterion for {@link #ksSum(double, double, int)} */
protected static final double KS_SUM_CAUCHY_CRITERION = 1E-20;
Convergence criterion for the sums in #pelzGood(double, double, int)} /** Convergence criterion for the sums in #pelzGood(double, double, int)} */
protected static final double PG_SUM_RELATIVE_ERROR = 1.0e-10;
No longer used. /** No longer used. */
@Deprecated
protected static final int SMALL_SAMPLE_PRODUCT = 200;
When product of sample sizes exceeds this value, 2-sample K-S test uses asymptotic
distribution to compute the p-value.
/**
* When product of sample sizes exceeds this value, 2-sample K-S test uses asymptotic
* distribution to compute the p-value.
*/
protected static final int LARGE_SAMPLE_PRODUCT = 10000;
Default number of iterations used by monteCarloP(double, int, int, boolean, int)
. Deprecated as of version 3.6, as this method is no longer needed. /** Default number of iterations used by {@link #monteCarloP(double, int, int, boolean, int)}.
* Deprecated as of version 3.6, as this method is no longer needed. */
@Deprecated
protected static final int MONTE_CARLO_ITERATIONS = 1000000;
Random data generator used by monteCarloP(double, int, int, boolean, int)
/** Random data generator used by {@link #monteCarloP(double, int, int, boolean, int)} */
private final RandomGenerator rng;
Construct a KolmogorovSmirnovTest instance with a default random data generator.
/**
* Construct a KolmogorovSmirnovTest instance with a default random data generator.
*/
public KolmogorovSmirnovTest() {
rng = new Well19937c();
}
Construct a KolmogorovSmirnovTest with the provided random data generator.
The #monteCarloP(double, int, int, boolean, int) that uses the generator supplied to this
constructor is deprecated as of version 3.6.
Params: - rng – random data generator used by
monteCarloP(double, int, int, boolean, int)
/**
* Construct a KolmogorovSmirnovTest with the provided random data generator.
* The #monteCarloP(double, int, int, boolean, int) that uses the generator supplied to this
* constructor is deprecated as of version 3.6.
*
* @param rng random data generator used by {@link #monteCarloP(double, int, int, boolean, int)}
*/
@Deprecated
public KolmogorovSmirnovTest(RandomGenerator rng) {
this.rng = rng;
}
Computes the p-value, or observed significance level, of a one-sample Kolmogorov-Smirnov test evaluating the null hypothesis that data
conforms to distribution
. If exact
is true, the distribution used to compute the p-value is computed using extended precision. See cdfExact(double, int)
. Params: - distribution – reference distribution
- data – sample being being evaluated
- exact – whether or not to force exact computation of the p-value
Throws: - InsufficientDataException – if
data
does not have length at least 2 - NullArgumentException – if
data
is null
Returns: the p-value associated with the null hypothesis that data
is a sample from distribution
/**
* Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
* href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
* evaluating the null hypothesis that {@code data} conforms to {@code distribution}. If
* {@code exact} is true, the distribution used to compute the p-value is computed using
* extended precision. See {@link #cdfExact(double, int)}.
*
* @param distribution reference distribution
* @param data sample being being evaluated
* @param exact whether or not to force exact computation of the p-value
* @return the p-value associated with the null hypothesis that {@code data} is a sample from
* {@code distribution}
* @throws InsufficientDataException if {@code data} does not have length at least 2
* @throws NullArgumentException if {@code data} is null
*/
public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data, boolean exact) {
return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact);
}
Computes the one-sample Kolmogorov-Smirnov test statistic, \(D_n=\sup_x |F_n(x)-F(x)|\) where \(F\) is the distribution (cdf) function associated with distribution
, \(n\) is the length of data
and \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of the values in data
. Params: - distribution – reference distribution
- data – sample being evaluated
Throws: - InsufficientDataException – if
data
does not have length at least 2 - NullArgumentException – if
data
is null
Returns: Kolmogorov-Smirnov statistic \(D_n\)
/**
* Computes the one-sample Kolmogorov-Smirnov test statistic, \(D_n=\sup_x |F_n(x)-F(x)|\) where
* \(F\) is the distribution (cdf) function associated with {@code distribution}, \(n\) is the
* length of {@code data} and \(F_n\) is the empirical distribution that puts mass \(1/n\) at
* each of the values in {@code data}.
*
* @param distribution reference distribution
* @param data sample being evaluated
* @return Kolmogorov-Smirnov statistic \(D_n\)
* @throws InsufficientDataException if {@code data} does not have length at least 2
* @throws NullArgumentException if {@code data} is null
*/
public double kolmogorovSmirnovStatistic(RealDistribution distribution, double[] data) {
checkArray(data);
final int n = data.length;
final double nd = n;
final double[] dataCopy = new double[n];
System.arraycopy(data, 0, dataCopy, 0, n);
Arrays.sort(dataCopy);
double d = 0d;
for (int i = 1; i <= n; i++) {
final double yi = distribution.cumulativeProbability(dataCopy[i - 1]);
final double currD = FastMath.max(yi - (i - 1) / nd, i / nd - yi);
if (currD > d) {
d = currD;
}
}
return d;
}
Computes the p-value, or observed significance level, of a two-sample Kolmogorov-Smirnov test evaluating the null hypothesis that x
and y
are samples drawn from the same probability distribution. Specifically, what is returned is an estimate of the probability that the kolmogorovSmirnovStatistic(double[], double[])
associated with a randomly selected partition of the combined sample into subsamples of sizes x.length
and y.length
will strictly exceed (if strict
is true
) or be at least as large as strict = false
) as kolmogorovSmirnovStatistic(x, y)
.
- For small samples (where the product of the sample sizes is less than 10000), the exact p-value is computed using the method presented in [4], implemented in
exactP(double, int, int, boolean)
.
- When the product of the sample sizes exceeds 10000, the asymptotic distribution of \(D_{n,m}\) is used. See
approximateP(double, int, int)
for details on the approximation.
If x.length * y.length
< 10000 and the combined set of values in x
and y
contains ties, random jitter is added to x
and y
to break ties before computing \(D_{n,m}\) and the p-value. The jitter is uniformly distributed on (-minDelta / 2, minDelta / 2) where minDelta is the smallest pairwise difference between values in the combined sample.
If ties are known to be present in the data, bootstrap(double[], double[], int, boolean)
may be used as an alternative method for estimating the p-value.
Params: - x – first sample dataset
- y – second sample dataset
- strict – whether or not the probability to compute is expressed as a strict inequality
(ignored for large samples)
Throws: - InsufficientDataException – if either
x
or y
does not have length at least 2 - NullArgumentException – if either
x
or y
is null
See Also: Returns: p-value associated with the null hypothesis that x
and y
represent samples from the same distribution
/**
* Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
* href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
* evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
* probability distribution. Specifically, what is returned is an estimate of the probability
* that the {@link #kolmogorovSmirnovStatistic(double[], double[])} associated with a randomly
* selected partition of the combined sample into subsamples of sizes {@code x.length} and
* {@code y.length} will strictly exceed (if {@code strict} is {@code true}) or be at least as
* large as {@code strict = false}) as {@code kolmogorovSmirnovStatistic(x, y)}.
* <ul>
* <li>For small samples (where the product of the sample sizes is less than
* {@value #LARGE_SAMPLE_PRODUCT}), the exact p-value is computed using the method presented
* in [4], implemented in {@link #exactP(double, int, int, boolean)}. </li>
* <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the
* asymptotic distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)}
* for details on the approximation.</li>
* </ul><p>
* If {@code x.length * y.length} < {@value #LARGE_SAMPLE_PRODUCT} and the combined set of values in
* {@code x} and {@code y} contains ties, random jitter is added to {@code x} and {@code y} to
* break ties before computing \(D_{n,m}\) and the p-value. The jitter is uniformly distributed
* on (-minDelta / 2, minDelta / 2) where minDelta is the smallest pairwise difference between
* values in the combined sample.</p>
* <p>
* If ties are known to be present in the data, {@link #bootstrap(double[], double[], int, boolean)}
* may be used as an alternative method for estimating the p-value.</p>
*
* @param x first sample dataset
* @param y second sample dataset
* @param strict whether or not the probability to compute is expressed as a strict inequality
* (ignored for large samples)
* @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
* samples from the same distribution
* @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
* least 2
* @throws NullArgumentException if either {@code x} or {@code y} is null
* @see #bootstrap(double[], double[], int, boolean)
*/
public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
final long lengthProduct = (long) x.length * y.length;
double[] xa = null;
double[] ya = null;
if (lengthProduct < LARGE_SAMPLE_PRODUCT && hasTies(x,y)) {
xa = MathArrays.copyOf(x);
ya = MathArrays.copyOf(y);
fixTies(xa, ya);
} else {
xa = x;
ya = y;
}
if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
return exactP(kolmogorovSmirnovStatistic(xa, ya), x.length, y.length, strict);
}
return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
}
Computes the p-value, or observed significance level, of a two-sample Kolmogorov-Smirnov test evaluating the null hypothesis that x
and y
are samples drawn from the same probability distribution. Assumes the strict form of the inequality used to compute the p-value. See kolmogorovSmirnovTest(RealDistribution, double[], boolean)
. Params: - x – first sample dataset
- y – second sample dataset
Throws: - InsufficientDataException – if either
x
or y
does not have length at least 2 - NullArgumentException – if either
x
or y
is null
Returns: p-value associated with the null hypothesis that x
and y
represent samples from the same distribution
/**
* Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
* href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
* evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
* probability distribution. Assumes the strict form of the inequality used to compute the
* p-value. See {@link #kolmogorovSmirnovTest(RealDistribution, double[], boolean)}.
*
* @param x first sample dataset
* @param y second sample dataset
* @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
* samples from the same distribution
* @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
* least 2
* @throws NullArgumentException if either {@code x} or {@code y} is null
*/
public double kolmogorovSmirnovTest(double[] x, double[] y) {
return kolmogorovSmirnovTest(x, y, true);
}
Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\) where \(n\) is the length of x
, \(m\) is the length of y
, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of the values in x
and \(F_m\) is the empirical distribution of the y
values. Params: - x – first sample
- y – second sample
Throws: - InsufficientDataException – if either
x
or y
does not have length at least 2 - NullArgumentException – if either
x
or y
is null
Returns: test statistic \(D_{n,m}\) used to evaluate the null hypothesis that x
and y
represent samples from the same underlying distribution
/**
* Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
* where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
* empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
* is the empirical distribution of the {@code y} values.
*
* @param x first sample
* @param y second sample
* @return test statistic \(D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
* {@code y} represent samples from the same underlying distribution
* @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
* least 2
* @throws NullArgumentException if either {@code x} or {@code y} is null
*/
public double kolmogorovSmirnovStatistic(double[] x, double[] y) {
return integralKolmogorovSmirnovStatistic(x, y)/((double)(x.length * (long)y.length));
}
Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\) where \(n\) is the length of x
, \(m\) is the length of y
, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of the values in x
and \(F_m\) is the empirical distribution of the y
values. Finally \(n m D_{n,m}\) is returned as long value. Params: - x – first sample
- y – second sample
Throws: - InsufficientDataException – if either
x
or y
does not have length at least 2 - NullArgumentException – if either
x
or y
is null
Returns: test statistic \(n m D_{n,m}\) used to evaluate the null hypothesis that x
and y
represent samples from the same underlying distribution
/**
* Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
* where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
* empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
* is the empirical distribution of the {@code y} values. Finally \(n m D_{n,m}\) is returned
* as long value.
*
* @param x first sample
* @param y second sample
* @return test statistic \(n m D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
* {@code y} represent samples from the same underlying distribution
* @throws InsufficientDataException if either {@code x} or {@code y} does not have length at
* least 2
* @throws NullArgumentException if either {@code x} or {@code y} is null
*/
private long integralKolmogorovSmirnovStatistic(double[] x, double[] y) {
checkArray(x);
checkArray(y);
// Copy and sort the sample arrays
final double[] sx = MathArrays.copyOf(x);
final double[] sy = MathArrays.copyOf(y);
Arrays.sort(sx);
Arrays.sort(sy);
final int n = sx.length;
final int m = sy.length;
int rankX = 0;
int rankY = 0;
long curD = 0l;
// Find the max difference between cdf_x and cdf_y
long supD = 0l;
do {
double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
while(rankX < n && Double.compare(sx[rankX], z) == 0) {
rankX += 1;
curD += m;
}
while(rankY < m && Double.compare(sy[rankY], z) == 0) {
rankY += 1;
curD -= n;
}
if (curD > supD) {
supD = curD;
}
else if (-curD > supD) {
supD = -curD;
}
} while(rankX < n && rankY < m);
return supD;
}
Computes the p-value, or observed significance level, of a one-sample Kolmogorov-Smirnov test evaluating the null hypothesis that data
conforms to distribution
. Params: - distribution – reference distribution
- data – sample being being evaluated
Throws: - InsufficientDataException – if
data
does not have length at least 2 - NullArgumentException – if
data
is null
Returns: the p-value associated with the null hypothesis that data
is a sample from distribution
/**
* Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
* href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
* evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
*
* @param distribution reference distribution
* @param data sample being being evaluated
* @return the p-value associated with the null hypothesis that {@code data} is a sample from
* {@code distribution}
* @throws InsufficientDataException if {@code data} does not have length at least 2
* @throws NullArgumentException if {@code data} is null
*/
public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data) {
return kolmogorovSmirnovTest(distribution, data, false);
}
Performs a Kolmogorov-Smirnov
test evaluating the null hypothesis that data
conforms to distribution
. Params: - distribution – reference distribution
- data – sample being being evaluated
- alpha – significance level of the test
Throws: - InsufficientDataException – if
data
does not have length at least 2 - NullArgumentException – if
data
is null
Returns: true iff the null hypothesis that data
is a sample from distribution
can be rejected with confidence 1 - alpha
/**
* Performs a <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov
* test</a> evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
*
* @param distribution reference distribution
* @param data sample being being evaluated
* @param alpha significance level of the test
* @return true iff the null hypothesis that {@code data} is a sample from {@code distribution}
* can be rejected with confidence 1 - {@code alpha}
* @throws InsufficientDataException if {@code data} does not have length at least 2
* @throws NullArgumentException if {@code data} is null
*/
public boolean kolmogorovSmirnovTest(RealDistribution distribution, double[] data, double alpha) {
if ((alpha <= 0) || (alpha > 0.5)) {
throw new OutOfRangeException(LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5);
}
return kolmogorovSmirnovTest(distribution, data) < alpha;
}
Estimates the p-value of a two-sample
Kolmogorov-Smirnov test evaluating the null hypothesis that x
and y
are samples drawn from the same probability distribution. This method estimates the p-value by repeatedly sampling sets of size x.length
and y.length
from the empirical distribution of the combined sample. When strict
is true, this is equivalent to the algorithm implemented in the R function ks.boot
, described in Jasjeet S. Sekhon. 2011. 'Multivariate and Propensity Score Matching
Software with Automated Balance Optimization: The Matching package for R.'
Journal of Statistical Software, 42(7): 1-52.
Params: - x – first sample
- y – second sample
- iterations – number of bootstrap resampling iterations
- strict – whether or not the null hypothesis is expressed as a strict inequality
Returns: estimated p-value
/**
* Estimates the <i>p-value</i> of a two-sample
* <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
* evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
* probability distribution. This method estimates the p-value by repeatedly sampling sets of size
* {@code x.length} and {@code y.length} from the empirical distribution of the combined sample.
* When {@code strict} is true, this is equivalent to the algorithm implemented in the R function
* {@code ks.boot}, described in <pre>
* Jasjeet S. Sekhon. 2011. 'Multivariate and Propensity Score Matching
* Software with Automated Balance Optimization: The Matching package for R.'
* Journal of Statistical Software, 42(7): 1-52.
* </pre>
* @param x first sample
* @param y second sample
* @param iterations number of bootstrap resampling iterations
* @param strict whether or not the null hypothesis is expressed as a strict inequality
* @return estimated p-value
*/
public double bootstrap(double[] x, double[] y, int iterations, boolean strict) {
final int xLength = x.length;
final int yLength = y.length;
final double[] combined = new double[xLength + yLength];
System.arraycopy(x, 0, combined, 0, xLength);
System.arraycopy(y, 0, combined, xLength, yLength);
final EnumeratedRealDistribution dist = new EnumeratedRealDistribution(rng, combined);
final long d = integralKolmogorovSmirnovStatistic(x, y);
int greaterCount = 0;
int equalCount = 0;
double[] curX;
double[] curY;
long curD;
for (int i = 0; i < iterations; i++) {
curX = dist.sample(xLength);
curY = dist.sample(yLength);
curD = integralKolmogorovSmirnovStatistic(curX, curY);
if (curD > d) {
greaterCount++;
} else if (curD == d) {
equalCount++;
}
}
return strict ? greaterCount / (double) iterations :
(greaterCount + equalCount) / (double) iterations;
}
Computes bootstrap(x, y, iterations, true)
. This is equivalent to ks.boot(x,y, nboots=iterations) using the R Matching package function. See #bootstrap(double[], double[], int, boolean). Params: - x – first sample
- y – second sample
- iterations – number of bootstrap resampling iterations
Returns: estimated p-value
/**
* Computes {@code bootstrap(x, y, iterations, true)}.
* This is equivalent to ks.boot(x,y, nboots=iterations) using the R Matching
* package function. See #bootstrap(double[], double[], int, boolean).
*
* @param x first sample
* @param y second sample
* @param iterations number of bootstrap resampling iterations
* @return estimated p-value
*/
public double bootstrap(double[] x, double[] y, int iterations) {
return bootstrap(x, y, iterations, true);
}
Calculates \(P(D_n < d)\) using the method described in [1] with quick decisions for extreme values given in [2] (see above). The result is not exact as with cdfExact(double, int)
because calculations are based on double
rather than BigFraction
. Params: - d – statistic
- n – sample size
Throws: - MathArithmeticException – if algorithm fails to convert
h
to a BigFraction
in expressing d
as \((k - h) / m\) for integer k, m
and \(0 \le h < 1\)
Returns: \(P(D_n < d)\)
/**
* Calculates \(P(D_n < d)\) using the method described in [1] with quick decisions for extreme
* values given in [2] (see above). The result is not exact as with
* {@link #cdfExact(double, int)} because calculations are based on
* {@code double} rather than {@link org.apache.commons.math3.fraction.BigFraction}.
*
* @param d statistic
* @param n sample size
* @return \(P(D_n < d)\)
* @throws MathArithmeticException if algorithm fails to convert {@code h} to a
* {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
* - h) / m\) for integer {@code k, m} and \(0 \le h < 1\)
*/
public double cdf(double d, int n)
throws MathArithmeticException {
return cdf(d, n, false);
}
Calculates P(D_n < d)
. The result is exact in the sense that BigFraction/BigReal is used everywhere at the expense of very slow execution time. Almost never choose this in real applications unless you are very sure; this is almost solely for verification purposes. Normally, you would choose cdf(double, int)
. See the class javadoc for definitions and algorithm description. Params: - d – statistic
- n – sample size
Throws: - MathArithmeticException – if the algorithm fails to convert
h
to a BigFraction
in expressing d
as \((k - h) / m\) for integer k, m
and \(0 \le h < 1\)
Returns: \(P(D_n < d)\)
/**
* Calculates {@code P(D_n < d)}. The result is exact in the sense that BigFraction/BigReal is
* used everywhere at the expense of very slow execution time. Almost never choose this in real
* applications unless you are very sure; this is almost solely for verification purposes.
* Normally, you would choose {@link #cdf(double, int)}. See the class
* javadoc for definitions and algorithm description.
*
* @param d statistic
* @param n sample size
* @return \(P(D_n < d)\)
* @throws MathArithmeticException if the algorithm fails to convert {@code h} to a
* {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
* - h) / m\) for integer {@code k, m} and \(0 \le h < 1\)
*/
public double cdfExact(double d, int n)
throws MathArithmeticException {
return cdf(d, n, true);
}
Calculates P(D_n < d)
using method described in [1] with quick decisions for extreme values given in [2] (see above). Params: - d – statistic
- n – sample size
- exact – whether the probability should be calculated exact using
BigFraction
everywhere at the expense of very slow execution time, or if double
should be used convenient places to gain speed. Almost never choose true
in real applications unless you are very sure; true
is almost solely for verification purposes.
Throws: - MathArithmeticException – if algorithm fails to convert
h
to a BigFraction
in expressing d
as \((k - h) / m\) for integer k, m
and \(0 \le h < 1\).
Returns: \(P(D_n < d)\)
/**
* Calculates {@code P(D_n < d)} using method described in [1] with quick decisions for extreme
* values given in [2] (see above).
*
* @param d statistic
* @param n sample size
* @param exact whether the probability should be calculated exact using
* {@link org.apache.commons.math3.fraction.BigFraction} everywhere at the expense of
* very slow execution time, or if {@code double} should be used convenient places to
* gain speed. Almost never choose {@code true} in real applications unless you are very
* sure; {@code true} is almost solely for verification purposes.
* @return \(P(D_n < d)\)
* @throws MathArithmeticException if algorithm fails to convert {@code h} to a
* {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
* - h) / m\) for integer {@code k, m} and \(0 \le h < 1\).
*/
public double cdf(double d, int n, boolean exact)
throws MathArithmeticException {
final double ninv = 1 / ((double) n);
final double ninvhalf = 0.5 * ninv;
if (d <= ninvhalf) {
return 0;
} else if (ninvhalf < d && d <= ninv) {
double res = 1;
final double f = 2 * d - ninv;
// n! f^n = n*f * (n-1)*f * ... * 1*x
for (int i = 1; i <= n; ++i) {
res *= i * f;
}
return res;
} else if (1 - ninv <= d && d < 1) {
return 1 - 2 * Math.pow(1 - d, n);
} else if (1 <= d) {
return 1;
}
if (exact) {
return exactK(d, n);
}
if (n <= 140) {
return roundedK(d, n);
}
return pelzGood(d, n);
}
Calculates the exact value of P(D_n < d)
using the method described in [1] (reference in class javadoc above) and BigFraction
(see above). Params: - d – statistic
- n – sample size
Throws: - MathArithmeticException – if algorithm fails to convert
h
to a BigFraction
in expressing d
as \((k - h) / m\) for integer k, m
and \(0 \le h < 1\).
Returns: the two-sided probability of \(P(D_n < d)\)
/**
* Calculates the exact value of {@code P(D_n < d)} using the method described in [1] (reference
* in class javadoc above) and {@link org.apache.commons.math3.fraction.BigFraction} (see
* above).
*
* @param d statistic
* @param n sample size
* @return the two-sided probability of \(P(D_n < d)\)
* @throws MathArithmeticException if algorithm fails to convert {@code h} to a
* {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
* - h) / m\) for integer {@code k, m} and \(0 \le h < 1\).
*/
private double exactK(double d, int n)
throws MathArithmeticException {
final int k = (int) Math.ceil(n * d);
final FieldMatrix<BigFraction> H = this.createExactH(d, n);
final FieldMatrix<BigFraction> Hpower = H.power(n);
BigFraction pFrac = Hpower.getEntry(k - 1, k - 1);
for (int i = 1; i <= n; ++i) {
pFrac = pFrac.multiply(i).divide(n);
}
/*
* BigFraction.doubleValue converts numerator to double and the denominator to double and
* divides afterwards. That gives NaN quite easy. This does not (scale is the number of
* digits):
*/
return pFrac.bigDecimalValue(20, BigDecimal.ROUND_HALF_UP).doubleValue();
}
Calculates P(D_n < d)
using method described in [1] and doubles (see above). Params: - d – statistic
- n – sample size
Returns: \(P(D_n < d)\)
/**
* Calculates {@code P(D_n < d)} using method described in [1] and doubles (see above).
*
* @param d statistic
* @param n sample size
* @return \(P(D_n < d)\)
*/
private double roundedK(double d, int n) {
final int k = (int) Math.ceil(n * d);
final RealMatrix H = this.createRoundedH(d, n);
final RealMatrix Hpower = H.power(n);
double pFrac = Hpower.getEntry(k - 1, k - 1);
for (int i = 1; i <= n; ++i) {
pFrac *= (double) i / (double) n;
}
return pFrac;
}
Computes the Pelz-Good approximation for \(P(D_n < d)\) as described in [2] in the class javadoc.
Params: - d – value of d-statistic (x in [2])
- n – sample size
Returns: \(P(D_n < d)\) Since: 3.4
/**
* Computes the Pelz-Good approximation for \(P(D_n < d)\) as described in [2] in the class javadoc.
*
* @param d value of d-statistic (x in [2])
* @param n sample size
* @return \(P(D_n < d)\)
* @since 3.4
*/
public double pelzGood(double d, int n) {
// Change the variable since approximation is for the distribution evaluated at d / sqrt(n)
final double sqrtN = FastMath.sqrt(n);
final double z = d * sqrtN;
final double z2 = d * d * n;
final double z4 = z2 * z2;
final double z6 = z4 * z2;
final double z8 = z4 * z4;
// Eventual return value
double ret = 0;
// Compute K_0(z)
double sum = 0;
double increment = 0;
double kTerm = 0;
double z2Term = MathUtils.PI_SQUARED / (8 * z2);
int k = 1;
for (; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
kTerm = 2 * k - 1;
increment = FastMath.exp(-z2Term * kTerm * kTerm);
sum += increment;
if (increment <= PG_SUM_RELATIVE_ERROR * sum) {
break;
}
}
if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
}
ret = sum * FastMath.sqrt(2 * FastMath.PI) / z;
// K_1(z)
// Sum is -inf to inf, but k term is always (k + 1/2) ^ 2, so really have
// twice the sum from k = 0 to inf (k = -1 is same as 0, -2 same as 1, ...)
final double twoZ2 = 2 * z2;
sum = 0;
kTerm = 0;
double kTerm2 = 0;
for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
kTerm = k + 0.5;
kTerm2 = kTerm * kTerm;
increment = (MathUtils.PI_SQUARED * kTerm2 - z2) * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
sum += increment;
if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
break;
}
}
if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
}
final double sqrtHalfPi = FastMath.sqrt(FastMath.PI / 2);
// Instead of doubling sum, divide by 3 instead of 6
ret += sum * sqrtHalfPi / (3 * z4 * sqrtN);
// K_2(z)
// Same drill as K_1, but with two doubly infinite sums, all k terms are even powers.
final double z4Term = 2 * z4;
final double z6Term = 6 * z6;
z2Term = 5 * z2;
final double pi4 = MathUtils.PI_SQUARED * MathUtils.PI_SQUARED;
sum = 0;
kTerm = 0;
kTerm2 = 0;
for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
kTerm = k + 0.5;
kTerm2 = kTerm * kTerm;
increment = (z6Term + z4Term + MathUtils.PI_SQUARED * (z4Term - z2Term) * kTerm2 +
pi4 * (1 - twoZ2) * kTerm2 * kTerm2) * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
sum += increment;
if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
break;
}
}
if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
}
double sum2 = 0;
kTerm2 = 0;
for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
kTerm2 = k * k;
increment = MathUtils.PI_SQUARED * kTerm2 * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
sum2 += increment;
if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
break;
}
}
if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
}
// Again, adjust coefficients instead of doubling sum, sum2
ret += (sqrtHalfPi / n) * (sum / (36 * z2 * z2 * z2 * z) - sum2 / (18 * z2 * z));
// K_3(z) One more time with feeling - two doubly infinite sums, all k powers even.
// Multiply coefficient denominators by 2, so omit doubling sums.
final double pi6 = pi4 * MathUtils.PI_SQUARED;
sum = 0;
double kTerm4 = 0;
double kTerm6 = 0;
for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
kTerm = k + 0.5;
kTerm2 = kTerm * kTerm;
kTerm4 = kTerm2 * kTerm2;
kTerm6 = kTerm4 * kTerm2;
increment = (pi6 * kTerm6 * (5 - 30 * z2) + pi4 * kTerm4 * (-60 * z2 + 212 * z4) +
MathUtils.PI_SQUARED * kTerm2 * (135 * z4 - 96 * z6) - 30 * z6 - 90 * z8) *
FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
sum += increment;
if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
break;
}
}
if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
}
sum2 = 0;
for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
kTerm2 = k * k;
kTerm4 = kTerm2 * kTerm2;
increment = (-pi4 * kTerm4 + 3 * MathUtils.PI_SQUARED * kTerm2 * z2) *
FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
sum2 += increment;
if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
break;
}
}
if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
throw new TooManyIterationsException(MAXIMUM_PARTIAL_SUM_COUNT);
}
return ret + (sqrtHalfPi / (sqrtN * n)) * (sum / (3240 * z6 * z4) +
+ sum2 / (108 * z6));
}
Creates H
of size m x m
as described in [1] (see above). Params: - d – statistic
- n – sample size
Throws: - NumberIsTooLargeException – if fractional part is greater than 1
- FractionConversionException – if algorithm fails to convert
h
to a BigFraction
in expressing d
as \((k - h) / m\) for integer k, m
and \(0 <= h < 1\).
Returns: H matrix
/***
* Creates {@code H} of size {@code m x m} as described in [1] (see above).
*
* @param d statistic
* @param n sample size
* @return H matrix
* @throws NumberIsTooLargeException if fractional part is greater than 1
* @throws FractionConversionException if algorithm fails to convert {@code h} to a
* {@link org.apache.commons.math3.fraction.BigFraction} in expressing {@code d} as \((k
* - h) / m\) for integer {@code k, m} and \(0 <= h < 1\).
*/
private FieldMatrix<BigFraction> createExactH(double d, int n)
throws NumberIsTooLargeException, FractionConversionException {
final int k = (int) Math.ceil(n * d);
final int m = 2 * k - 1;
final double hDouble = k - n * d;
if (hDouble >= 1) {
throw new NumberIsTooLargeException(hDouble, 1.0, false);
}
BigFraction h = null;
try {
h = new BigFraction(hDouble, 1.0e-20, 10000);
} catch (final FractionConversionException e1) {
try {
h = new BigFraction(hDouble, 1.0e-10, 10000);
} catch (final FractionConversionException e2) {
h = new BigFraction(hDouble, 1.0e-5, 10000);
}
}
final BigFraction[][] Hdata = new BigFraction[m][m];
/*
* Start by filling everything with either 0 or 1.
*/
for (int i = 0; i < m; ++i) {
for (int j = 0; j < m; ++j) {
if (i - j + 1 < 0) {
Hdata[i][j] = BigFraction.ZERO;
} else {
Hdata[i][j] = BigFraction.ONE;
}
}
}
/*
* Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
* hPowers[m-1] = h^m
*/
final BigFraction[] hPowers = new BigFraction[m];
hPowers[0] = h;
for (int i = 1; i < m; ++i) {
hPowers[i] = h.multiply(hPowers[i - 1]);
}
/*
* First column and last row has special values (each other reversed).
*/
for (int i = 0; i < m; ++i) {
Hdata[i][0] = Hdata[i][0].subtract(hPowers[i]);
Hdata[m - 1][i] = Hdata[m - 1][i].subtract(hPowers[m - i - 1]);
}
/*
* [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
* (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check:
*/
if (h.compareTo(BigFraction.ONE_HALF) == 1) {
Hdata[m - 1][0] = Hdata[m - 1][0].add(h.multiply(2).subtract(1).pow(m));
}
/*
* Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
* j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
* needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
* divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of
* j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't
* really necessary.
*/
for (int i = 0; i < m; ++i) {
for (int j = 0; j < i + 1; ++j) {
if (i - j + 1 > 0) {
for (int g = 2; g <= i - j + 1; ++g) {
Hdata[i][j] = Hdata[i][j].divide(g);
}
}
}
}
return new Array2DRowFieldMatrix<BigFraction>(BigFractionField.getInstance(), Hdata);
}
Creates H
of size m x m
as described in [1] (see above) using double-precision. Params: - d – statistic
- n – sample size
Throws: - NumberIsTooLargeException – if fractional part is greater than 1
Returns: H matrix
/***
* Creates {@code H} of size {@code m x m} as described in [1] (see above)
* using double-precision.
*
* @param d statistic
* @param n sample size
* @return H matrix
* @throws NumberIsTooLargeException if fractional part is greater than 1
*/
private RealMatrix createRoundedH(double d, int n)
throws NumberIsTooLargeException {
final int k = (int) Math.ceil(n * d);
final int m = 2 * k - 1;
final double h = k - n * d;
if (h >= 1) {
throw new NumberIsTooLargeException(h, 1.0, false);
}
final double[][] Hdata = new double[m][m];
/*
* Start by filling everything with either 0 or 1.
*/
for (int i = 0; i < m; ++i) {
for (int j = 0; j < m; ++j) {
if (i - j + 1 < 0) {
Hdata[i][j] = 0;
} else {
Hdata[i][j] = 1;
}
}
}
/*
* Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
* hPowers[m-1] = h^m
*/
final double[] hPowers = new double[m];
hPowers[0] = h;
for (int i = 1; i < m; ++i) {
hPowers[i] = h * hPowers[i - 1];
}
/*
* First column and last row has special values (each other reversed).
*/
for (int i = 0; i < m; ++i) {
Hdata[i][0] = Hdata[i][0] - hPowers[i];
Hdata[m - 1][i] -= hPowers[m - i - 1];
}
/*
* [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
* (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check:
*/
if (Double.compare(h, 0.5) > 0) {
Hdata[m - 1][0] += FastMath.pow(2 * h - 1, m);
}
/*
* Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
* j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
* needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
* divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of
* j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't
* really necessary.
*/
for (int i = 0; i < m; ++i) {
for (int j = 0; j < i + 1; ++j) {
if (i - j + 1 > 0) {
for (int g = 2; g <= i - j + 1; ++g) {
Hdata[i][j] /= g;
}
}
}
}
return MatrixUtils.createRealMatrix(Hdata);
}
Verifies that array
has length at least 2. Params: - array – array to test
Throws: - NullArgumentException – if array is null
- InsufficientDataException – if array is too short
/**
* Verifies that {@code array} has length at least 2.
*
* @param array array to test
* @throws NullArgumentException if array is null
* @throws InsufficientDataException if array is too short
*/
private void checkArray(double[] array) {
if (array == null) {
throw new NullArgumentException(LocalizedFormats.NULL_NOT_ALLOWED);
}
if (array.length < 2) {
throw new InsufficientDataException(LocalizedFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, array.length,
2);
}
}
Computes \( 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2} \) stopping when successive partial sums are within tolerance
of one another, or when maxIterations
partial sums have been computed. If the sum does not converge before maxIterations
iterations a TooManyIterationsException
is thrown. Params: - t – argument
- tolerance – Cauchy criterion for partial sums
- maxIterations – maximum number of partial sums to compute
Throws: - TooManyIterationsException – if the series does not converge
Returns: Kolmogorov sum evaluated at t
/**
* Computes \( 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2} \) stopping when successive partial
* sums are within {@code tolerance} of one another, or when {@code maxIterations} partial sums
* have been computed. If the sum does not converge before {@code maxIterations} iterations a
* {@link TooManyIterationsException} is thrown.
*
* @param t argument
* @param tolerance Cauchy criterion for partial sums
* @param maxIterations maximum number of partial sums to compute
* @return Kolmogorov sum evaluated at t
* @throws TooManyIterationsException if the series does not converge
*/
public double ksSum(double t, double tolerance, int maxIterations) {
if (t == 0.0) {
return 0.0;
}
// TODO: for small t (say less than 1), the alternative expansion in part 3 of [1]
// from class javadoc should be used.
final double x = -2 * t * t;
int sign = -1;
long i = 1;
double partialSum = 0.5d;
double delta = 1;
while (delta > tolerance && i < maxIterations) {
delta = FastMath.exp(x * i * i);
partialSum += sign * delta;
sign *= -1;
i++;
}
if (i == maxIterations) {
throw new TooManyIterationsException(maxIterations);
}
return partialSum * 2;
}
Given a d-statistic in the range [0, 1] and the two sample sizes n and m, an integral d-statistic in the range [0, n*m] is calculated, that can be used for comparison with other integral d-statistics. Depending whether strict
is true
or not, the returned value divided by (n*m) is greater than (resp greater than or equal to) the given d value (allowing some tolerance). Params: - d – a d-statistic in the range [0, 1]
- n – first sample size
- m – second sample size
- strict – whether the returned value divided by (n*m) is allowed to be equal to d
Returns: the integral d-statistic in the range [0, n*m]
/**
* Given a d-statistic in the range [0, 1] and the two sample sizes n and m,
* an integral d-statistic in the range [0, n*m] is calculated, that can be used for
* comparison with other integral d-statistics. Depending whether {@code strict} is
* {@code true} or not, the returned value divided by (n*m) is greater than
* (resp greater than or equal to) the given d value (allowing some tolerance).
*
* @param d a d-statistic in the range [0, 1]
* @param n first sample size
* @param m second sample size
* @param strict whether the returned value divided by (n*m) is allowed to be equal to d
* @return the integral d-statistic in the range [0, n*m]
*/
private static long calculateIntegralD(double d, int n, int m, boolean strict) {
final double tol = 1e-12; // d-values within tol of one another are considered equal
long nm = n * (long)m;
long upperBound = (long)FastMath.ceil((d - tol) * nm);
long lowerBound = (long)FastMath.floor((d + tol) * nm);
if (strict && lowerBound == upperBound) {
return upperBound + 1l;
}
else {
return upperBound;
}
}
Computes \(P(D_{n,m} > d)\) if strict
is true
; otherwise \(P(D_{n,m} \ge d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See kolmogorovSmirnovStatistic(double[], double[])
for the definition of \(D_{n,m}\).
The returned probability is exact, implemented by unwinding the recursive function
definitions presented in [4] (class javadoc).
Params: - d – D-statistic value
- n – first sample size
- m – second sample size
- strict – whether or not the probability to compute is expressed as a strict inequality
Returns: probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\) greater than (resp. greater than or equal to) d
/**
* Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge
* d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See
* {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
* <p>
* The returned probability is exact, implemented by unwinding the recursive function
* definitions presented in [4] (class javadoc).
* </p>
*
* @param d D-statistic value
* @param n first sample size
* @param m second sample size
* @param strict whether or not the probability to compute is expressed as a strict inequality
* @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
* greater than (resp. greater than or equal to) {@code d}
*/
public double exactP(double d, int n, int m, boolean strict) {
return 1 - n(m, n, m, n, calculateIntegralD(d, m, n, strict), strict) /
CombinatoricsUtils.binomialCoefficientDouble(n + m, m);
}
Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See kolmogorovSmirnovStatistic(double[], double[])
for the definition of \(D_{n,m}\). Specifically, what is returned is \(1 - k(d \sqrt{mn / (m + n)})\) where \(k(t) = 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2}\). See ksSum(double, double, int)
for details on how convergence of the sum is determined. This implementation passes ksSum
1.0E-20 as tolerance
and 100000 as maxIterations
.
Params: - d – D-statistic value
- n – first sample size
- m – second sample size
Returns: approximate probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\) greater than d
/**
* Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\)
* is the 2-sample Kolmogorov-Smirnov statistic. See
* {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
* <p>
* Specifically, what is returned is \(1 - k(d \sqrt{mn / (m + n)})\) where \(k(t) = 1 + 2
* \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2}\). See {@link #ksSum(double, double, int)} for
* details on how convergence of the sum is determined. This implementation passes {@code ksSum}
* {@value #KS_SUM_CAUCHY_CRITERION} as {@code tolerance} and
* {@value #MAXIMUM_PARTIAL_SUM_COUNT} as {@code maxIterations}.
* </p>
*
* @param d D-statistic value
* @param n first sample size
* @param m second sample size
* @return approximate probability that a randomly selected m-n partition of m + n generates
* \(D_{n,m}\) greater than {@code d}
*/
public double approximateP(double d, int n, int m) {
final double dm = m;
final double dn = n;
return 1 - ksSum(d * FastMath.sqrt((dm * dn) / (dm + dn)),
KS_SUM_CAUCHY_CRITERION, MAXIMUM_PARTIAL_SUM_COUNT);
}
Fills a boolean array randomly with a fixed number of true
values. The method uses a simplified version of the Fisher-Yates shuffle algorithm. By processing first the true
values followed by the remaining false
values less random numbers need to be generated. The method is optimized for the case that the number of true
values is larger than or equal to the number of false
values. Params: - b – boolean array
- numberOfTrueValues – number of
true
values the boolean array should finally have - rng – random data generator
/**
* Fills a boolean array randomly with a fixed number of {@code true} values.
* The method uses a simplified version of the Fisher-Yates shuffle algorithm.
* By processing first the {@code true} values followed by the remaining {@code false} values
* less random numbers need to be generated. The method is optimized for the case
* that the number of {@code true} values is larger than or equal to the number of
* {@code false} values.
*
* @param b boolean array
* @param numberOfTrueValues number of {@code true} values the boolean array should finally have
* @param rng random data generator
*/
static void fillBooleanArrayRandomlyWithFixedNumberTrueValues(final boolean[] b, final int numberOfTrueValues, final RandomGenerator rng) {
Arrays.fill(b, true);
for (int k = numberOfTrueValues; k < b.length; k++) {
final int r = rng.nextInt(k + 1);
b[(b[r]) ? r : k] = false;
}
}
Uses Monte Carlo simulation to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See kolmogorovSmirnovStatistic(double[], double[])
for the definition of \(D_{n,m}\). The simulation generates iterations
random partitions of m + n
into an n
set and an m
set, computing \(D_{n,m}\) for each partition and returning the proportion of values that are greater than d
, or greater than or equal to d
if strict
is false
.
Params: - d – D-statistic value
- n – first sample size
- m – second sample size
- iterations – number of random partitions to generate
- strict – whether or not the probability to compute is expressed as a strict inequality
Returns: proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\) greater than (resp. greater than or equal to) d
/**
* Uses Monte Carlo simulation to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) is the
* 2-sample Kolmogorov-Smirnov statistic. See
* {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
* <p>
* The simulation generates {@code iterations} random partitions of {@code m + n} into an
* {@code n} set and an {@code m} set, computing \(D_{n,m}\) for each partition and returning
* the proportion of values that are greater than {@code d}, or greater than or equal to
* {@code d} if {@code strict} is {@code false}.
* </p>
*
* @param d D-statistic value
* @param n first sample size
* @param m second sample size
* @param iterations number of random partitions to generate
* @param strict whether or not the probability to compute is expressed as a strict inequality
* @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
* greater than (resp. greater than or equal to) {@code d}
*/
public double monteCarloP(final double d, final int n, final int m, final boolean strict,
final int iterations) {
return integralMonteCarloP(calculateIntegralD(d, n, m, strict), n, m, iterations);
}
Uses Monte Carlo simulation to approximate \(P(D_{n,m} >= d/(n*m))\) where \(D_{n,m}\) is the
2-sample Kolmogorov-Smirnov statistic.
Here d is the D-statistic represented as long value. The real D-statistic is obtained by dividing d by n*m. See also monteCarloP(double, int, int, boolean, int)
.
Params: - d – integral D-statistic
- n – first sample size
- m – second sample size
- iterations – number of random partitions to generate
Returns: proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\) greater than or equal to d/(n*m))
/**
* Uses Monte Carlo simulation to approximate \(P(D_{n,m} >= d/(n*m))\) where \(D_{n,m}\) is the
* 2-sample Kolmogorov-Smirnov statistic.
* <p>
* Here d is the D-statistic represented as long value.
* The real D-statistic is obtained by dividing d by n*m.
* See also {@link #monteCarloP(double, int, int, boolean, int)}.
*
* @param d integral D-statistic
* @param n first sample size
* @param m second sample size
* @param iterations number of random partitions to generate
* @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\)
* greater than or equal to {@code d/(n*m))}
*/
private double integralMonteCarloP(final long d, final int n, final int m, final int iterations) {
// ensure that nn is always the max of (n, m) to require fewer random numbers
final int nn = FastMath.max(n, m);
final int mm = FastMath.min(n, m);
final int sum = nn + mm;
int tail = 0;
final boolean b[] = new boolean[sum];
for (int i = 0; i < iterations; i++) {
fillBooleanArrayRandomlyWithFixedNumberTrueValues(b, nn, rng);
long curD = 0l;
for(int j = 0; j < b.length; ++j) {
if (b[j]) {
curD += mm;
if (curD >= d) {
tail++;
break;
}
} else {
curD -= nn;
if (curD <= -d) {
tail++;
break;
}
}
}
}
return (double) tail / iterations;
}
If there are no ties in the combined dataset formed from x and y, this
method is a no-op. If there are ties, a uniform random deviate in
(-minDelta / 2, minDelta / 2) - {0} is added to each value in x and y, where
minDelta is the minimum difference between unequal values in the combined
sample. A fixed seed is used to generate the jitter, so repeated activations
with the same input arrays result in the same values.
NOTE: if there are ties in the data, this method overwrites the data in
x and y with the jittered values.
Params: - x – first sample
- y – second sample
/**
* If there are no ties in the combined dataset formed from x and y, this
* method is a no-op. If there are ties, a uniform random deviate in
* (-minDelta / 2, minDelta / 2) - {0} is added to each value in x and y, where
* minDelta is the minimum difference between unequal values in the combined
* sample. A fixed seed is used to generate the jitter, so repeated activations
* with the same input arrays result in the same values.
*
* NOTE: if there are ties in the data, this method overwrites the data in
* x and y with the jittered values.
*
* @param x first sample
* @param y second sample
*/
private static void fixTies(double[] x, double[] y) {
final double[] values = MathArrays.unique(MathArrays.concatenate(x,y));
if (values.length == x.length + y.length) {
return; // There are no ties
}
// Find the smallest difference between values, or 1 if all values are the same
double minDelta = 1;
double prev = values[0];
double delta = 1;
for (int i = 1; i < values.length; i++) {
delta = prev - values[i];
if (delta < minDelta) {
minDelta = delta;
}
prev = values[i];
}
minDelta /= 2;
// Add jitter using a fixed seed (so same arguments always give same results),
// low-initialization-overhead generator
final RealDistribution dist =
new UniformRealDistribution(new JDKRandomGenerator(100), -minDelta, minDelta);
// It is theoretically possible that jitter does not break ties, so repeat
// until all ties are gone. Bound the loop and throw MIE if bound is exceeded.
int ct = 0;
boolean ties = true;
do {
jitter(x, dist);
jitter(y, dist);
ties = hasTies(x, y);
ct++;
} while (ties && ct < 1000);
if (ties) {
throw new MathInternalError(); // Should never happen
}
}
Returns true iff there are ties in the combined sample
formed from x and y.
Params: - x – first sample
- y – second sample
Returns: true if x and y together contain ties
/**
* Returns true iff there are ties in the combined sample
* formed from x and y.
*
* @param x first sample
* @param y second sample
* @return true if x and y together contain ties
*/
private static boolean hasTies(double[] x, double[] y) {
final HashSet<Double> values = new HashSet<Double>();
for (int i = 0; i < x.length; i++) {
if (!values.add(x[i])) {
return true;
}
}
for (int i = 0; i < y.length; i++) {
if (!values.add(y[i])) {
return true;
}
}
return false;
}
Adds random jitter to data
using deviates sampled from dist
.
Note that jitter is applied in-place - i.e., the array
values are overwritten with the result of applying jitter.
Params: - data – input/output data array - entries overwritten by the method
- dist – probability distribution to sample for jitter values
Throws: - NullPointerException – if either of the parameters is null
/**
* Adds random jitter to {@code data} using deviates sampled from {@code dist}.
* <p>
* Note that jitter is applied in-place - i.e., the array
* values are overwritten with the result of applying jitter.</p>
*
* @param data input/output data array - entries overwritten by the method
* @param dist probability distribution to sample for jitter values
* @throws NullPointerException if either of the parameters is null
*/
private static void jitter(double[] data, RealDistribution dist) {
for (int i = 0; i < data.length; i++) {
data[i] += dist.sample();
}
}
The function C(i, j) defined in [4] (class javadoc), formula (5.5).
defined to return 1 if |i/n - j/m| <= c; 0 otherwise. Here c is scaled up
and recoded as a long to avoid rounding errors in comparison tests, so what
is actually tested is |im - jn| <= cmn.
Params: - i – first path parameter
- j – second path paramter
- m – first sample size
- n – second sample size
- cmn – integral D-statistic (see
calculateIntegralD(double, int, int, boolean)
) - strict – whether or not the null hypothesis uses strict inequality
Returns: C(i,j) for given m, n, c
/**
* The function C(i, j) defined in [4] (class javadoc), formula (5.5).
* defined to return 1 if |i/n - j/m| <= c; 0 otherwise. Here c is scaled up
* and recoded as a long to avoid rounding errors in comparison tests, so what
* is actually tested is |im - jn| <= cmn.
*
* @param i first path parameter
* @param j second path paramter
* @param m first sample size
* @param n second sample size
* @param cmn integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
* @param strict whether or not the null hypothesis uses strict inequality
* @return C(i,j) for given m, n, c
*/
private static int c(int i, int j, int m, int n, long cmn, boolean strict) {
if (strict) {
return FastMath.abs(i*(long)n - j*(long)m) <= cmn ? 1 : 0;
}
return FastMath.abs(i*(long)n - j*(long)m) < cmn ? 1 : 0;
}
The function N(i, j) defined in [4] (class javadoc).
Returns the number of paths over the lattice {(i,j) : 0 <= i <= n, 0 <= j <= m}
from (0,0) to (i,j) satisfying C(h,k, m, n, c) = 1 for each (h,k) on the path.
The return value is integral, but subject to overflow, so it is maintained and
returned as a double.
Params: - i – first path parameter
- j – second path parameter
- m – first sample size
- n – second sample size
- cnm – integral D-statistic (see
calculateIntegralD(double, int, int, boolean)
) - strict – whether or not the null hypothesis uses strict inequality
Returns: number or paths to (i, j) from (0,0) representing D-values as large as c for given m, n
/**
* The function N(i, j) defined in [4] (class javadoc).
* Returns the number of paths over the lattice {(i,j) : 0 <= i <= n, 0 <= j <= m}
* from (0,0) to (i,j) satisfying C(h,k, m, n, c) = 1 for each (h,k) on the path.
* The return value is integral, but subject to overflow, so it is maintained and
* returned as a double.
*
* @param i first path parameter
* @param j second path parameter
* @param m first sample size
* @param n second sample size
* @param cnm integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)})
* @param strict whether or not the null hypothesis uses strict inequality
* @return number or paths to (i, j) from (0,0) representing D-values as large as c for given m, n
*/
private static double n(int i, int j, int m, int n, long cnm, boolean strict) {
/*
* Unwind the recursive definition given in [4].
* Compute n(1,1), n(1,2)...n(2,1), n(2,2)... up to n(i,j), one row at a time.
* When n(i,*) are being computed, lag[] holds the values of n(i - 1, *).
*/
final double[] lag = new double[n];
double last = 0;
for (int k = 0; k < n; k++) {
lag[k] = c(0, k + 1, m, n, cnm, strict);
}
for (int k = 1; k <= i; k++) {
last = c(k, 0, m, n, cnm, strict);
for (int l = 1; l <= j; l++) {
lag[l - 1] = c(k, l, m, n, cnm, strict) * (last + lag[l - 1]);
last = lag[l - 1];
}
}
return last;
}
}