001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.math.stat.descriptive.moment;
018    
019    import java.io.Serializable;
020    
021    import org.apache.commons.math.exception.NullArgumentException;
022    import org.apache.commons.math.exception.util.LocalizedFormats;
023    import org.apache.commons.math.stat.descriptive.WeightedEvaluation;
024    import org.apache.commons.math.stat.descriptive.AbstractStorelessUnivariateStatistic;
025    
026    /**
027     * Computes the variance of the available values.  By default, the unbiased
028     * "sample variance" definitional formula is used:
029     * <p>
030     * variance = sum((x_i - mean)^2) / (n - 1) </p>
031     * <p>
032     * where mean is the {@link Mean} and <code>n</code> is the number
033     * of sample observations.</p>
034     * <p>
035     * The definitional formula does not have good numerical properties, so
036     * this implementation does not compute the statistic using the definitional
037     * formula. <ul>
038     * <li> The <code>getResult</code> method computes the variance using
039     * updating formulas based on West's algorithm, as described in
040     * <a href="http://doi.acm.org/10.1145/359146.359152"> Chan, T. F. and
041     * J. G. Lewis 1979, <i>Communications of the ACM</i>,
042     * vol. 22 no. 9, pp. 526-531.</a></li>
043     * <li> The <code>evaluate</code> methods leverage the fact that they have the
044     * full array of values in memory to execute a two-pass algorithm.
045     * Specifically, these methods use the "corrected two-pass algorithm" from
046     * Chan, Golub, Levesque, <i>Algorithms for Computing the Sample Variance</i>,
047     * American Statistician, vol. 37, no. 3 (1983) pp. 242-247.</li></ul>
048     * Note that adding values using <code>increment</code> or
049     * <code>incrementAll</code> and then executing <code>getResult</code> will
050     * sometimes give a different, less accurate, result than executing
051     * <code>evaluate</code> with the full array of values. The former approach
052     * should only be used when the full array of values is not available.</p>
053     * <p>
054     * The "population variance"  ( sum((x_i - mean)^2) / n ) can also
055     * be computed using this statistic.  The <code>isBiasCorrected</code>
056     * property determines whether the "population" or "sample" value is
057     * returned by the <code>evaluate</code> and <code>getResult</code> methods.
058     * To compute population variances, set this property to <code>false.</code>
059     * </p>
060     * <p>
061     * <strong>Note that this implementation is not synchronized.</strong> If
062     * multiple threads access an instance of this class concurrently, and at least
063     * one of the threads invokes the <code>increment()</code> or
064     * <code>clear()</code> method, it must be synchronized externally.</p>
065     *
066     * @version $Revision: 1006299 $ $Date: 2010-10-10 16:47:17 +0200 (dim. 10 oct. 2010) $
067     */
068    public class Variance extends AbstractStorelessUnivariateStatistic implements Serializable, WeightedEvaluation {
069    
070        /** Serializable version identifier */
071        private static final long serialVersionUID = -9111962718267217978L;
072    
073        /** SecondMoment is used in incremental calculation of Variance*/
074        protected SecondMoment moment = null;
075    
076        /**
077         * Boolean test to determine if this Variance should also increment
078         * the second moment, this evaluates to false when this Variance is
079         * constructed with an external SecondMoment as a parameter.
080         */
081        protected boolean incMoment = true;
082    
083        /**
084         * Determines whether or not bias correction is applied when computing the
085         * value of the statisic.  True means that bias is corrected.  See
086         * {@link Variance} for details on the formula.
087         */
088        private boolean isBiasCorrected = true;
089    
090        /**
091         * Constructs a Variance with default (true) <code>isBiasCorrected</code>
092         * property.
093         */
094        public Variance() {
095            moment = new SecondMoment();
096        }
097    
098        /**
099         * Constructs a Variance based on an external second moment.
100         *
101         * @param m2 the SecondMoment (Third or Fourth moments work
102         * here as well.)
103         */
104        public Variance(final SecondMoment m2) {
105            incMoment = false;
106            this.moment = m2;
107        }
108    
109        /**
110         * Constructs a Variance with the specified <code>isBiasCorrected</code>
111         * property
112         *
113         * @param isBiasCorrected  setting for bias correction - true means
114         * bias will be corrected and is equivalent to using the argumentless
115         * constructor
116         */
117        public Variance(boolean isBiasCorrected) {
118            moment = new SecondMoment();
119            this.isBiasCorrected = isBiasCorrected;
120        }
121    
122        /**
123         * Constructs a Variance with the specified <code>isBiasCorrected</code>
124         * property and the supplied external second moment.
125         *
126         * @param isBiasCorrected  setting for bias correction - true means
127         * bias will be corrected
128         * @param m2 the SecondMoment (Third or Fourth moments work
129         * here as well.)
130         */
131        public Variance(boolean isBiasCorrected, SecondMoment m2) {
132            incMoment = false;
133            this.moment = m2;
134            this.isBiasCorrected = isBiasCorrected;
135        }
136    
137        /**
138         * Copy constructor, creates a new {@code Variance} identical
139         * to the {@code original}
140         *
141         * @param original the {@code Variance} instance to copy
142         */
143        public Variance(Variance original) {
144            copy(original, this);
145        }
146    
147        /**
148         * {@inheritDoc}
149         * <p>If all values are available, it is more accurate to use
150         * {@link #evaluate(double[])} rather than adding values one at a time
151         * using this method and then executing {@link #getResult}, since
152         * <code>evaluate</code> leverages the fact that is has the full
153         * list of values together to execute a two-pass algorithm.
154         * See {@link Variance}.</p>
155         */
156        @Override
157        public void increment(final double d) {
158            if (incMoment) {
159                moment.increment(d);
160            }
161        }
162    
163        /**
164         * {@inheritDoc}
165         */
166        @Override
167        public double getResult() {
168                if (moment.n == 0) {
169                    return Double.NaN;
170                } else if (moment.n == 1) {
171                    return 0d;
172                } else {
173                    if (isBiasCorrected) {
174                        return moment.m2 / (moment.n - 1d);
175                    } else {
176                        return moment.m2 / (moment.n);
177                    }
178                }
179        }
180    
181        /**
182         * {@inheritDoc}
183         */
184        public long getN() {
185            return moment.getN();
186        }
187    
188        /**
189         * {@inheritDoc}
190         */
191        @Override
192        public void clear() {
193            if (incMoment) {
194                moment.clear();
195            }
196        }
197    
198        /**
199         * Returns the variance of the entries in the input array, or
200         * <code>Double.NaN</code> if the array is empty.
201         * <p>
202         * See {@link Variance} for details on the computing algorithm.</p>
203         * <p>
204         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
205         * <p>
206         * Throws <code>IllegalArgumentException</code> if the array is null.</p>
207         * <p>
208         * Does not change the internal state of the statistic.</p>
209         *
210         * @param values the input array
211         * @return the variance of the values or Double.NaN if length = 0
212         * @throws IllegalArgumentException if the array is null
213         */
214        @Override
215        public double evaluate(final double[] values) {
216            if (values == null) {
217                throw new NullArgumentException(LocalizedFormats.INPUT_ARRAY);
218            }
219            return evaluate(values, 0, values.length);
220        }
221    
222        /**
223         * Returns the variance of the entries in the specified portion of
224         * the input array, or <code>Double.NaN</code> if the designated subarray
225         * is empty.
226         * <p>
227         * See {@link Variance} for details on the computing algorithm.</p>
228         * <p>
229         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
230         * <p>
231         * Does not change the internal state of the statistic.</p>
232         * <p>
233         * Throws <code>IllegalArgumentException</code> if the array is null.</p>
234         *
235         * @param values the input array
236         * @param begin index of the first array element to include
237         * @param length the number of elements to include
238         * @return the variance of the values or Double.NaN if length = 0
239         * @throws IllegalArgumentException if the array is null or the array index
240         *  parameters are not valid
241         */
242        @Override
243        public double evaluate(final double[] values, final int begin, final int length) {
244    
245            double var = Double.NaN;
246    
247            if (test(values, begin, length)) {
248                clear();
249                if (length == 1) {
250                    var = 0.0;
251                } else if (length > 1) {
252                    Mean mean = new Mean();
253                    double m = mean.evaluate(values, begin, length);
254                    var = evaluate(values, m, begin, length);
255                }
256            }
257            return var;
258        }
259    
260        /**
261         * <p>Returns the weighted variance of the entries in the specified portion of
262         * the input array, or <code>Double.NaN</code> if the designated subarray
263         * is empty.</p>
264         * <p>
265         * Uses the formula <pre>
266         *   Σ(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(Σ(weights[i]) - 1)
267         * </pre>
268         * where weightedMean is the weighted mean</p>
269         * <p>
270         * This formula will not return the same result as the unweighted variance when all
271         * weights are equal, unless all weights are equal to 1. The formula assumes that
272         * weights are to be treated as "expansion values," as will be the case if for example
273         * the weights represent frequency counts. To normalize weights so that the denominator
274         * in the variance computation equals the length of the input vector minus one, use <pre>
275         *   <code>evaluate(values, MathUtils.normalizeArray(weights, values.length)); </code>
276         * </pre>
277         * <p>
278         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
279         * <p>
280         * Throws <code>IllegalArgumentException</code> if any of the following are true:
281         * <ul><li>the values array is null</li>
282         *     <li>the weights array is null</li>
283         *     <li>the weights array does not have the same length as the values array</li>
284         *     <li>the weights array contains one or more infinite values</li>
285         *     <li>the weights array contains one or more NaN values</li>
286         *     <li>the weights array contains negative values</li>
287         *     <li>the start and length arguments do not determine a valid array</li>
288         * </ul></p>
289         * <p>
290         * Does not change the internal state of the statistic.</p>
291         * <p>
292         * Throws <code>IllegalArgumentException</code> if either array is null.</p>
293         *
294         * @param values the input array
295         * @param weights the weights array
296         * @param begin index of the first array element to include
297         * @param length the number of elements to include
298         * @return the weighted variance of the values or Double.NaN if length = 0
299         * @throws IllegalArgumentException if the parameters are not valid
300         * @since 2.1
301         */
302        public double evaluate(final double[] values, final double[] weights,
303                               final int begin, final int length) {
304    
305            double var = Double.NaN;
306    
307            if (test(values, weights,begin, length)) {
308                clear();
309                if (length == 1) {
310                    var = 0.0;
311                } else if (length > 1) {
312                    Mean mean = new Mean();
313                    double m = mean.evaluate(values, weights, begin, length);
314                    var = evaluate(values, weights, m, begin, length);
315                }
316            }
317            return var;
318        }
319    
320        /**
321         * <p>
322         * Returns the weighted variance of the entries in the the input array.</p>
323         * <p>
324         * Uses the formula <pre>
325         *   Σ(weights[i]*(values[i] - weightedMean)<sup>2</sup>)/(Σ(weights[i]) - 1)
326         * </pre>
327         * where weightedMean is the weighted mean</p>
328         * <p>
329         * This formula will not return the same result as the unweighted variance when all
330         * weights are equal, unless all weights are equal to 1. The formula assumes that
331         * weights are to be treated as "expansion values," as will be the case if for example
332         * the weights represent frequency counts. To normalize weights so that the denominator
333         * in the variance computation equals the length of the input vector minus one, use <pre>
334         *   <code>evaluate(values, MathUtils.normalizeArray(weights, values.length)); </code>
335         * </pre>
336         * <p>
337         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
338         * <p>
339         * Throws <code>IllegalArgumentException</code> if any of the following are true:
340         * <ul><li>the values array is null</li>
341         *     <li>the weights array is null</li>
342         *     <li>the weights array does not have the same length as the values array</li>
343         *     <li>the weights array contains one or more infinite values</li>
344         *     <li>the weights array contains one or more NaN values</li>
345         *     <li>the weights array contains negative values</li>
346         * </ul></p>
347         * <p>
348         * Does not change the internal state of the statistic.</p>
349         * <p>
350         * Throws <code>IllegalArgumentException</code> if either array is null.</p>
351         *
352         * @param values the input array
353         * @param weights the weights array
354         * @return the weighted variance of the values
355         * @throws IllegalArgumentException if the parameters are not valid
356         * @since 2.1
357         */
358        public double evaluate(final double[] values, final double[] weights) {
359            return evaluate(values, weights, 0, values.length);
360        }
361    
362        /**
363         * Returns the variance of the entries in the specified portion of
364         * the input array, using the precomputed mean value.  Returns
365         * <code>Double.NaN</code> if the designated subarray is empty.
366         * <p>
367         * See {@link Variance} for details on the computing algorithm.</p>
368         * <p>
369         * The formula used assumes that the supplied mean value is the arithmetic
370         * mean of the sample data, not a known population parameter.  This method
371         * is supplied only to save computation when the mean has already been
372         * computed.</p>
373         * <p>
374         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
375         * <p>
376         * Throws <code>IllegalArgumentException</code> if the array is null.</p>
377         * <p>
378         * Does not change the internal state of the statistic.</p>
379         *
380         * @param values the input array
381         * @param mean the precomputed mean value
382         * @param begin index of the first array element to include
383         * @param length the number of elements to include
384         * @return the variance of the values or Double.NaN if length = 0
385         * @throws IllegalArgumentException if the array is null or the array index
386         *  parameters are not valid
387         */
388        public double evaluate(final double[] values, final double mean,
389                final int begin, final int length) {
390    
391            double var = Double.NaN;
392    
393            if (test(values, begin, length)) {
394                if (length == 1) {
395                    var = 0.0;
396                } else if (length > 1) {
397                    double accum = 0.0;
398                    double dev = 0.0;
399                    double accum2 = 0.0;
400                    for (int i = begin; i < begin + length; i++) {
401                        dev = values[i] - mean;
402                        accum += dev * dev;
403                        accum2 += dev;
404                    }
405                    double len = length;
406                    if (isBiasCorrected) {
407                        var = (accum - (accum2 * accum2 / len)) / (len - 1.0);
408                    } else {
409                        var = (accum - (accum2 * accum2 / len)) / len;
410                    }
411                }
412            }
413            return var;
414        }
415    
416        /**
417         * Returns the variance of the entries in the input array, using the
418         * precomputed mean value.  Returns <code>Double.NaN</code> if the array
419         * is empty.
420         * <p>
421         * See {@link Variance} for details on the computing algorithm.</p>
422         * <p>
423         * If <code>isBiasCorrected</code> is <code>true</code> the formula used
424         * assumes that the supplied mean value is the arithmetic mean of the
425         * sample data, not a known population parameter.  If the mean is a known
426         * population parameter, or if the "population" version of the variance is
427         * desired, set <code>isBiasCorrected</code> to <code>false</code> before
428         * invoking this method.</p>
429         * <p>
430         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
431         * <p>
432         * Throws <code>IllegalArgumentException</code> if the array is null.</p>
433         * <p>
434         * Does not change the internal state of the statistic.</p>
435         *
436         * @param values the input array
437         * @param mean the precomputed mean value
438         * @return the variance of the values or Double.NaN if the array is empty
439         * @throws IllegalArgumentException if the array is null
440         */
441        public double evaluate(final double[] values, final double mean) {
442            return evaluate(values, mean, 0, values.length);
443        }
444    
445        /**
446         * Returns the weighted variance of the entries in the specified portion of
447         * the input array, using the precomputed weighted mean value.  Returns
448         * <code>Double.NaN</code> if the designated subarray is empty.
449         * <p>
450         * Uses the formula <pre>
451         *   Σ(weights[i]*(values[i] - mean)<sup>2</sup>)/(Σ(weights[i]) - 1)
452         * </pre></p>
453         * <p>
454         * The formula used assumes that the supplied mean value is the weighted arithmetic
455         * mean of the sample data, not a known population parameter. This method
456         * is supplied only to save computation when the mean has already been
457         * computed.</p>
458         * <p>
459         * This formula will not return the same result as the unweighted variance when all
460         * weights are equal, unless all weights are equal to 1. The formula assumes that
461         * weights are to be treated as "expansion values," as will be the case if for example
462         * the weights represent frequency counts. To normalize weights so that the denominator
463         * in the variance computation equals the length of the input vector minus one, use <pre>
464         *   <code>evaluate(values, MathUtils.normalizeArray(weights, values.length), mean); </code>
465         * </pre>
466         * <p>
467         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
468         * <p>
469         * Throws <code>IllegalArgumentException</code> if any of the following are true:
470         * <ul><li>the values array is null</li>
471         *     <li>the weights array is null</li>
472         *     <li>the weights array does not have the same length as the values array</li>
473         *     <li>the weights array contains one or more infinite values</li>
474         *     <li>the weights array contains one or more NaN values</li>
475         *     <li>the weights array contains negative values</li>
476         *     <li>the start and length arguments do not determine a valid array</li>
477         * </ul></p>
478         * <p>
479         * Does not change the internal state of the statistic.</p>
480         *
481         * @param values the input array
482         * @param weights the weights array
483         * @param mean the precomputed weighted mean value
484         * @param begin index of the first array element to include
485         * @param length the number of elements to include
486         * @return the variance of the values or Double.NaN if length = 0
487         * @throws IllegalArgumentException if the parameters are not valid
488         * @since 2.1
489         */
490        public double evaluate(final double[] values, final double[] weights,
491                               final double mean, final int begin, final int length) {
492    
493            double var = Double.NaN;
494    
495            if (test(values, weights, begin, length)) {
496                if (length == 1) {
497                    var = 0.0;
498                } else if (length > 1) {
499                    double accum = 0.0;
500                    double dev = 0.0;
501                    double accum2 = 0.0;
502                    for (int i = begin; i < begin + length; i++) {
503                        dev = values[i] - mean;
504                        accum += weights[i] * (dev * dev);
505                        accum2 += weights[i] * dev;
506                    }
507    
508                    double sumWts = 0;
509                    for (int i = 0; i < weights.length; i++) {
510                        sumWts += weights[i];
511                    }
512    
513                    if (isBiasCorrected) {
514                        var = (accum - (accum2 * accum2 / sumWts)) / (sumWts - 1.0);
515                    } else {
516                        var = (accum - (accum2 * accum2 / sumWts)) / sumWts;
517                    }
518                }
519            }
520            return var;
521        }
522    
523        /**
524         * <p>Returns the weighted variance of the values in the input array, using
525         * the precomputed weighted mean value.</p>
526         * <p>
527         * Uses the formula <pre>
528         *   Σ(weights[i]*(values[i] - mean)<sup>2</sup>)/(Σ(weights[i]) - 1)
529         * </pre></p>
530         * <p>
531         * The formula used assumes that the supplied mean value is the weighted arithmetic
532         * mean of the sample data, not a known population parameter. This method
533         * is supplied only to save computation when the mean has already been
534         * computed.</p>
535         * <p>
536         * This formula will not return the same result as the unweighted variance when all
537         * weights are equal, unless all weights are equal to 1. The formula assumes that
538         * weights are to be treated as "expansion values," as will be the case if for example
539         * the weights represent frequency counts. To normalize weights so that the denominator
540         * in the variance computation equals the length of the input vector minus one, use <pre>
541         *   <code>evaluate(values, MathUtils.normalizeArray(weights, values.length), mean); </code>
542         * </pre>
543         * <p>
544         * Returns 0 for a single-value (i.e. length = 1) sample.</p>
545         * <p>
546         * Throws <code>IllegalArgumentException</code> if any of the following are true:
547         * <ul><li>the values array is null</li>
548         *     <li>the weights array is null</li>
549         *     <li>the weights array does not have the same length as the values array</li>
550         *     <li>the weights array contains one or more infinite values</li>
551         *     <li>the weights array contains one or more NaN values</li>
552         *     <li>the weights array contains negative values</li>
553         * </ul></p>
554         * <p>
555         * Does not change the internal state of the statistic.</p>
556         *
557         * @param values the input array
558         * @param weights the weights array
559         * @param mean the precomputed weighted mean value
560         * @return the variance of the values or Double.NaN if length = 0
561         * @throws IllegalArgumentException if the parameters are not valid
562         * @since 2.1
563         */
564        public double evaluate(final double[] values, final double[] weights, final double mean) {
565            return evaluate(values, weights, mean, 0, values.length);
566        }
567    
568        /**
569         * @return Returns the isBiasCorrected.
570         */
571        public boolean isBiasCorrected() {
572            return isBiasCorrected;
573        }
574    
575        /**
576         * @param biasCorrected The isBiasCorrected to set.
577         */
578        public void setBiasCorrected(boolean biasCorrected) {
579            this.isBiasCorrected = biasCorrected;
580        }
581    
582        /**
583         * {@inheritDoc}
584         */
585        @Override
586        public Variance copy() {
587            Variance result = new Variance();
588            copy(this, result);
589            return result;
590        }
591    
592        /**
593         * Copies source to dest.
594         * <p>Neither source nor dest can be null.</p>
595         *
596         * @param source Variance to copy
597         * @param dest Variance to copy to
598         * @throws NullPointerException if either source or dest is null
599         */
600        public static void copy(Variance source, Variance dest) {
601            if (source == null ||
602                dest == null) {
603                throw new NullArgumentException();
604            }
605            dest.setData(source.getDataRef());
606            dest.moment = source.moment.copy();
607            dest.isBiasCorrected = source.isBiasCorrected;
608            dest.incMoment = source.incMoment;
609        }
610    }