Subject: [Boost-bugs] [Boost C++ Libraries] #6057: Lazy variance in accumulator gives wrong result when too high statistics
From: Boost C++ Libraries (noreply_at_[hidden])
Date: 2011-10-25 13:39:37
#6057: Lazy variance in accumulator gives wrong result when too high statistics
---------------------------------------------+------------------------------
Reporter: Johan <johan.luisier@â¦> | Owner: eric_niebler
Type: Bugs | Status: new
Milestone: To Be Determined | Component: accumulator
Version: Boost 1.44.0 | Severity: Problem
Keywords: accumulator lazy_variance |
---------------------------------------------+------------------------------
Hello,
I was trying to compare performances of various ways to compute the
variance of a distribution of integer values, because I need a very fast
one.
I tried to use lazy_variance and variance, a custom algorithm, (and also
TProfile class from ROOT).
What I saw is that using lazy_variance gives the correct mean value but a
weird variance value when the number of events becomes large (n >
0x3FFFF).
The bug was seen first with boost 1.44.0 on a 32 bit Fedora 14 machine
(using gcc 4.5.1), and also with boost 1.46.1 on a 64 bit Mandriva 2011
(using gcc 4.6.1).
Here below is the source code to reproduce the bug (ROOT stuff commented)
:
{{{
#include <iostream>
#include <vector>
#include <cmath>
// accumulators
#include <boost/accumulators/accumulators.hpp>
#include <boost/accumulators/statistics/stats.hpp>
#include <boost/accumulators/statistics/mean.hpp>
#include <boost/accumulators/statistics/variance.hpp>
// root
//#include <TProfile.h>
// timing info
#include <boost/progress.hpp>
// Nbr aléatoires
#include <boost/random/lagged_fibonacci.hpp>
#include <boost/random/variate_generator.hpp>
#include <boost/random/uniform_int.hpp>
using namespace std;
using namespace boost::accumulators;
using namespace boost;
class MyStat
{
public:
MyStat()
: NbrEvt( 0u ), Mean( 0. ), MeanSq( 0. )
{};
~MyStat()
{};
void fill( const int& ev )
{
Mean = ( Mean * NbrEvt + ev ) / double( NbrEvt + 1 );
MeanSq = ( MeanSq * NbrEvt + ev * ev ) / double( NbrEvt + 1 );
NbrEvt++;
};
double mean() const
{
return Mean;
};
double variance() const
{
return MeanSq - Mean * Mean;
};
protected:
long unsigned NbrEvt;
double Mean;
double MeanSq;
};
int main( int argc, char* argv[] )
{
lagged_fibonacci44497 GenerateurBase( 12345 );
uniform_int< int > Distribution( -128, 127 );
variate_generator< lagged_fibonacci44497&,
uniform_int< int > > Generateur( GenerateurBase,
Distribution );
vector< int > Data;
long unsigned i, total( 0xFFFF );
for ( i = 0; i < total; i++ )
Data . push_back( Generateur() );
cout << "Test using " << total << " data points." << endl << endl;
accumulator_set< signed, stats< tag::mean,
tag::lazy_variance > > lazyVar;
accumulator_set< signed, stats< tag::mean,
tag::variance > > immedVar;
//TProfile * profile = new TProfile( "profile", "My profile", 1, -.5,
.5, "S" );
MyStat simpleStat;
progress_timer * chrono = 0;
cout << "Lazy variance accumulator : ";
chrono = new progress_timer();
for ( i = 0; i < total; i++ )
lazyVar( Data[ i ] );
delete chrono;
cout << "Mean : " << extract_result< tag::mean >( lazyVar ) << endl
<< "Variance : " << extract_result< tag::lazy_variance >( lazyVar )
<< endl << endl;
cout << "Immediate variance accumulator : ";
chrono = new progress_timer();
for ( i = 0; i < total; i++ )
immedVar( Data[ i ] );
delete chrono;
cout << "Mean : " << extract_result< tag::mean >( immedVar ) << endl
<< "Variance : "
<< extract_result< tag::variance >( immedVar )
<< endl << endl;
/*
cout << "TProfile : ";
chrono = new progress_timer();
for ( i = 0; i < total; i++ )
profile -> Fill( 0., double( Data[ i ] ) );
delete chrono;
cout << "Mean : " << profile -> GetBinContent( 1 ) << endl
<< "Variance : " << profile -> GetBinError( 1 ) * profile ->
GetBinError( 1 )
<< endl << endl;
*/
cout << "Simple classe : ";
chrono = new progress_timer();
for ( i = 0; i < total; i++ )
simpleStat . fill( Data[ i ] );
delete chrono;
cout << "Mean : " << simpleStat . mean() << endl << "Variance : "
<< simpleStat . variance() << endl << endl << flush;
delete profile;
return 0;
}
}}}
And here is the output :
{{{
Test using 268435455 data points.
Lazy variance accumulator : 0.27 s
Mean : -0.497278
Variance : 5.30971
Immediate variance accumulator : 9.07 s
Mean : -0.497278
Variance : 5461.31
TProfile : 15.56 s
Mean : -0.497278
Variance : 5461.31
Simple classe : 7.28 s
Mean : -0.497278
Variance : 5461.31
}}}
Note that all mean values agree, and all variance values but the lazy one
do too.
Cheers,
johan
-- Ticket URL: <https://svn.boost.org/trac/boost/ticket/6057> Boost C++ Libraries <http://www.boost.org/> Boost provides free peer-reviewed portable C++ source libraries.
This archive was generated by hypermail 2.1.7 : 2017-02-16 18:50:07 UTC