Boost logo

Boost-Commit :

From: pbristow_at_[hidden]
Date: 2008-06-10 12:27:10


Author: pbristow
Date: 2008-06-10 12:27:10 EDT (Tue, 10 Jun 2008)
New Revision: 46296
URL: http://svn.boost.org/trac/boost/changeset/46296

Log:
See test_quantiles.cpp for tests. To be added to boxplot.hpp
Added:
   sandbox/SOC/2007/visualization/boost/svg_plot/quantile.hpp (contents, props changed)

Added: sandbox/SOC/2007/visualization/boost/svg_plot/quantile.hpp
==============================================================================
--- (empty file)
+++ sandbox/SOC/2007/visualization/boost/svg_plot/quantile.hpp 2008-06-10 12:27:10 EDT (Tue, 10 Jun 2008)
@@ -0,0 +1,108 @@
+// quantile.cpp
+
+// Copyright Paul A. Bristow 2008
+
+// Use, modification and distribution are subject to the
+// Boost Software License, Version 1.0.
+// (See accompanying file LICENSE_1_0.txt
+// or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+// Estimate p th quantile of data using one of 5 definitions.
+// Default is the recommendation of Hyndman and Fan = definition #8.
+// Use p = 0.25 and p = 0.75 to get quartiles for boxplots.
+
+namespace boost
+{
+namespace svg
+{
+
+double quantile(vector<double>& data, double p, int HF_definition = 8)
+{ // p=fraction of population, percentile, e.g. p=0.25 for 1st quartile.
+ // Usually double p = 0.25, or p = 0.75 for boxplots
+ // Data must be ordered minimum to maximum.
+ size_t n = data.size();
+ // The interquartile range is calculated using the 1st & 3rd sample quartiles,
+ // but there are various ways to calculate those quartiles.
+ // Five of Hyndman and Fan's sample quantile definitions
+ // Rob J. Hyndman and Yanan Fan, 1996, "Sample Quantiles in Statistical Packages",
+ // The American Statistician 50(4):361-365, (1996).
+ // have a particularly simple common form given by the VBA code given.
+ // You select among the methods according to which definition of m is chosen.
+ // from Function quantile(data, p) in Visual Basic from
+ // http://www.pcreview.co.uk/forums/thread-3494699.php
+ // Jerry W. Lewis
+ double m;
+ switch (HF_definition)
+ { // Hyndman and Fan definitions:
+ case 4: // H&F 4 SAS (PCTLDEF=1), R (type=4), Maple (method=3)
+ m = 0; // Small IQR
+ break;
+ case 5: // H&F 5: R (type=5), Maple (method=4), Wolfram Mathematica quartiles
+ // http://support.wolfram.com/archive/mathematica/quartilesnotes.html
+ // "Symmetric linear interpolation a common choice when the data represent a sample
+ // from a continuous distribution and
+ // you want an unbiased estimate of the quartiles of that distribution."
+ m = 0.5; // moderate IRQ
+ break;
+ case 6: // H&F 6: Minitab, SPSS, BMDP, JMP, SAS (PCTLDEF=4), R(type=6), Maple (method=5).
+ // calculates the 1st (3rd) quartile
+ // as the median of the lower (upper) "half" of the sample.
+ // This "half" sample excludes the sample median (k observations) for odd n (=2*k+1).
+ // This will tend to be a better estimate for the population quartiles,
+ // but will tend to give quartile estimates that are a bit too far
+ // from the center of the whole sample (too wide an interquartile range).
+ m = p; // Biggest IQR
+ break;
+ case 7:// H&F 7: Excel, S-Plus, R (type=7[default]), Maxima, Maple (method=6).
+ m = 1 - p; // Smallest IQR
+ // For a continuous distribution,
+ // this will tend to give too narrow an interquartile range,
+ // since there will tend to be a small fraction of the population beyond the extreme
+ // sample observations. In particular, for odd n (=2*k+1), Excel calculates the
+ // 1st (3rd) quartile as the median of the lower (upper) "half" of the sample
+ // including the sample median (k+1 observations).
+ break;
+ case 8: // H&F 8: R (type=8), Maple (method=7[default]).
+ m = (p + 1) / 3;
+ break; // Middling IQR
+ // Definition 8 recommended by H&F because
+ // "it is approximately median-unbaised estimate regardless of distribution"
+ // and thus suitable for continuous and discrete distributions.
+ // (Maple's default definition),
+ // which gives quartiles between those reported by Minitab and Excel. This
+ // approach is approximately median unbiased for continuous distributions.
+ case 9: m = (p + 1.5) / 4; // H&F 9: R (type=9), Maple (method=8).
+ break;
+ default:
+ m = p; // == case H&F 6: Minitab, SPSS, BMDP, JMP, SAS (PCTLDEF=4), R(type=6), Maple (method=5).
+ }
+ double npm = n * p + m; // 9 * 0.25 + 0.25
+ size_t j = static_cast<size_t>(npm); // j = Fix(npm): fix returns integer part.
+ // Index for data array.
+ //if (j == 0)
+ //{ // If j = 0 Then j = 1 // assumes 1 based ...
+ // j = 1;
+ //}
+ // but J can't be < 0 so check not needed.
+ if (j >= n)
+ { // If j > n Then j = n
+ j = n -1; // Limit to last element of data.
+ }
+
+ double g = npm - j; // fractional part of npm = 0.5
+ //// quantile = WorksheetFunction.Small(data, j) Returns the k-th smallest value in a data set.
+ //if ((g >= 0) && (j <= n))
+ //{ // Need to interpolate.
+ // quantile = (1 - g) * quantile + g * series[j]; // WorksheetFunction.Small(data, j + 1)
+ //}
+ double quantile = data[j - 1]; // VBA example appears to be 1-based (not default 0)! So j-1.
+ if ((g >= 0) && ((j-1) < n))
+ { // Need (and can) to interpolate.
+ quantile = (1 - g) * quantile + g * data[j]; // VBA example appears to be 1-based (not default 0)! So NOT j+1.
+ }
+ //cout << "Quantile(" << p << ") = " << quantile << endl;
+ return quantile;
+} // double quantile(vector<double>& data, double p)
+
+ } // namespace boost
+} // namespace svg


Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk