Boost logo

Boost :

From: Matt Hurd (matt_at_[hidden])
Date: 2003-02-27 16:24:54


I see in the Wiki a couple of comments about variance/std dev with n and n-1
being referred to at the denominator. Just to clear it up:

when it is a complete population the denominator should be n.
when it is a random sample it is n-1.

sample variance = sum(Xi - mean(X))^2/(n-1)

or more usefully = (n*sum(Xi^2))-sum(Xi)^2
                    ______________________
                         n(n-1)

For those of you with MS Excel you will see both supported.
________________________________________________________

I note the idea of using accumulators for the various sums to determine
stats in the Wiki.

I would like to see this idea extended...

I think a noble goal would be to accumulate sums over populations in
collection or sliding windows of sequences but only those sums you need.
Like AOP this would be best if done generatively. That is, don't generate
sum(X^2) if you only need a mean. Have state to generate max and min if
needed.
________________________________________________________

Another noble goal would be to harness the power of incremental
calculations:

The simple case is a mean:
        - This is generated from sum(X)
        - add X to the old sum(X) for a collection
        - for a window on a sequence, take the old X off and put the new X on

This seems trivial but adds much more power for calcs, especially windows.

Say you have a one variable linear regression over a window, typically you
would use a library function to calc the stats and it would be O(n). If the
state were accumulated incrementally, then the first window is O(n) and
moving the window a point is O(1).

Say over 10 years of typical data you have 2500 points you are doing a
year's window, 250 points. Using the usual way, calling a library function,
you get, if you pardon my abuse of the notation, O(250 * 2500) or of order
625,000. If you do it incrementally, you get of order of 250 + 2500*1 =
2750 calcs which is a 227 times speed up or 4.4% of the calc.

Interestingly, this can also apply to things like min and max over a window
where you see if the point rolling off is the min or max, if not then you
only have to compare the current min, max to the new point, otherwise you
have to scan...

This is just finite differencing... just like Bresenham's line algorithm.

You sometimes see such incremental approaches in some FORTRAN stats
libraries, but not as often as you should. Also, users often ignore them
anyway as they are typically conditioned to call the regression or whatever.

Below is some code I wrote to do this with an MS compiler before templates
was supported to illustrate the incremental case. This code is oriented to
timeseries of numbers being treated as "streams" for a market trading
application. Please excuse the old historic ugliness...

Now, the idea is to translate such concepts into vogue C++ with policies
etal ;-)

Regards,

matt.

_______________

Using a NR call
Non-incremental
_______________

void FunctorLineFit::evaluate(Offset day) {
  Offset order = Offset(parameter(0));

  float *x = floatVectorGet(day,order,0);
  float *y = floatVectorGet(day,order,1);
  float a,b,siga,sigb,chi2,q;

  fit(x-1,y-1, (int) order, NULL, 0, &a,&b,&siga,&sigb,&chi2,&q);
  setResult(day,0,PriceType(a));
  setResult(day,1,(PriceType)b);
  setResult(day,2,(PriceType)siga);
  setResult(day,3,(PriceType)sigb);
  setResult(day,4,(PriceType)chi2);

  floatVectorRelease(x,0);
  floatVectorRelease(y,1);
} // FunctorLineFit

______________________

Using incremental calc
______________________

class FunctorLineFitSeriesIncremental : public FunctorStored {

  RWDECLARE_COLLECTABLE(FunctorLineFitSeriesIncremental)

  public:
    virtual Offset waste() const { return Offset(parameter(0)); };
    virtual void evaluate(Offset j);

  private:
    virtual void setPrivate();
    PriceType
n,n2,n3,sumX,sumXX,a1,a2,b1,b2,np1,syx1,syx2,syx3,syx4,seny1,r21,ar21,pb1,pb
2,pb3;
    Offset intN;

};

void FunctorLineFitSeriesIncremental::evaluate(Offset day) {
  assert(invariant());

  Offset counter;

  PriceType sumY, sumYY, sumXY;
  PriceType a,b,yx,syx,seny,r2,ar2,pb,yxLow, yxHigh;
  PriceType oldY, newY, oldXY, newXY;
  PriceType sumY2, syxTemp, r2Temp, t, t2;
  PriceType temp;

  // This code relies on the left to right conditional evaluation of &&
  if (day>0 && cached(day-1)) {
    // Use tomorrow's data to calculate today.
    oldY = datum(day-1,0);
    newY = datum(day+intN-1,0);

    sumY = value(day-1,10) - oldY + newY;
    sumYY = value(day-1,11) - oldY*oldY + newY*newY;

    oldXY = n*oldY;
    newXY = sumY;
    sumXY = value(day-1,12) - oldXY + newXY;

  } else if (day<length()-1 && cached(day+1)) {
    // Use yesterday's data to calculate today.
    newY = datum(day,0);
    oldY = datum(day+intN,0);

    oldXY = value(day+1,10);
    sumY = oldXY - oldY + newY;
    sumYY = value(day+1,11) - oldY*oldY + newY*newY;

    newXY = n*newY;
    sumXY = value(day+1,12) - oldXY + newXY;

  } else {
    // Calculate today from first principles
    setPrivate();
    sumY = sumYY = sumXY = 0.0;
    for (counter = 0; counter<intN; counter++) {
      newY = datum(day+counter,0);

      sumY += newY;
      sumYY += newY*newY;
      sumXY += (n-counter)*newY;
    } // for
  } // if

  sumY2 = sumY*sumY;
  a = a1*sumY-a2*sumXY;
  b = b1*sumXY-b2*sumY;
  yx = np1*b+a;

  syxTemp = sumXY-syx4*sumY;
        temp = syx1*sumYY -syx2*sumY2 -syx3*syxTemp*syxTemp;
        if (temp<eps) temp = 0.0;
  syx = sqrt(temp);

  seny = syx*seny1;
  yxHigh = yx + seny;
  yxLow = yx - seny;
  r2Temp = 2*sumXY - np1*sumY;
  r2 = 3*r2Temp*r2Temp/(r21*(n*sumYY-sumY2));

  ar2 = 1 - ar21*(1-r2);

        t = (temp!=0.0)? (b*pb1/syx) : (b*pb1);
  t2 = t*t;
  pb = (PriceType) betai( float(pb2), 0.5, float(pb3/(pb3+t2)) );

  setResult(day,0,b);
  setResult(day,1,a);
  setResult(day,2,yx);
  setResult(day,3,syx);
  setResult(day,4,seny);
  setResult(day,5,yxLow);
  setResult(day,6,yxHigh);
  setResult(day,7,r2);
  setResult(day,8,ar2);
  setResult(day,9,pb);
  setResult(day,10,sumY);
  setResult(day,11,sumYY);
  setResult(day,12,sumXY);
}

void FunctorLineFitSeriesIncremental::setPrivate() {
  n = parameter(0);
  n2 = n*n;
  n3 = n2*n;

  sumX = (n2+n)/2;
  sumXX = n*(n+1)*(2*n+1)/6;

  a1 = (4*n+2)/(n2-n);
  a2 = 6/(n2-n);

  b1 = 12/(n3-n);
  b2 = b1/2*(n+1);

  np1 = n+1;

  syx1 = 1/(n-2);
  syx2 = syx1/n;
  syx3 = syx2*12/(n2-1);
  syx4 = np1/2;

  seny1 = sqrt((n2+3*n+2)/(n2-n));

  r21 = n2-1;

  ar21 = (n-1)/(n-2);

  pb1 = sqrt((n3-n)/12);
  pb3 = n-2;
  pb2 = pb3/2;
  intN = Offset(n);
} // setPrivate

_______________________

Autocorrelation example
_______________________

#include "stdafx.h"
#include "f_autcor.h"
#include "math.h"

RWDEFINE_COLLECTABLE(FunctorAutocorrelation,FTR_ISA_F_AUTCOR)

/*--------------------------------------------------------------------------
-------------------
  Evaluate.

  Note:
    This autocorrelation calculation supports left or right incremental
calculation.

----------------------------------------------------------------------------
-----------------*/
void FunctorAutocorrelation::evaluate(Offset day) {
  assert(invariant());

  Offset counter;
  Offset terminateCondition;
  Offset n = Offset(parameter(0));
  Offset lag = Offset(parameter(1));

  PriceType sumX, sumXX, sumXbottom, sumXtop, sumXXlag;
  PriceType covar, var, autocorr, mean;
  PriceType oldX, newX, oldXbottom, newXbottom, oldXtop, newXtop, oldXXlag,
newXXlag;

  // This code relies on the left to right conditional evaluation of &&
  if (day>0 && cached(day-1)) {
    // Use tomorrow's data to calculate today.
    oldX = datum(day-1,0);
    newX = datum(day+n-1,0);

    oldXbottom = oldX;
    newXbottom = datum(day+lag-1,0);

    oldXtop = datum(day-1+n-lag,0);
    newXtop = newX;

    oldXXlag = oldX*datum(day-1+lag,0);
    newXXlag = datum(day+n-lag-1,0)*newX;

    sumX = value(day-1,5) - oldX + newX;
    sumXX = value(day-1,6) - oldX*oldX + newX*newX;
    sumXbottom = value(day-1,7) - oldXbottom + newXbottom;
    sumXtop = value(day-1,8) - oldXtop + newXtop;
    sumXXlag = value(day-1,9) - oldXXlag + newXXlag;

  } else if (day<length()-1 && cached(day+1)) {
    // Use yesterday's data to calculate today.
    newX = datum(day,0);
    oldX = datum(day+n,0);

    newXbottom = newX;
    oldXbottom = datum(day+lag,0);

    newXtop = datum(day+n-lag,0);
    oldXtop = oldX;

    newXXlag = newX*datum(day+lag,0);
    oldXXlag = datum(day+n-lag,0)*oldX;

    sumX = value(day+1,5) - oldX + newX;
    sumXX = value(day+1,6) - oldX*oldX + newX*newX;
    sumXbottom = value(day+1,7) - oldXbottom + newXbottom;
    sumXtop = value(day+1,8) - oldXtop + newXtop;
    sumXXlag = value(day+1,9) - oldXXlag + newXXlag;

  } else {
    // Calculate today from first principles
    sumX = sumXX = sumXbottom = sumXtop = sumXXlag = 0.0;

    for (counter = 0; counter<lag; counter++) {
      newX = datum(day+counter,0);
      sumX += newX;
      sumXX += newX*newX;
      newXXlag = newX*datum(day+counter+lag,0);
      sumXXlag += newXXlag;
    } // for
    sumXbottom = sumX;

    terminateCondition = n-lag;
    for (;counter<terminateCondition; counter++) {
      newX = datum(day+counter,0);
      sumX += newX;
      sumXX += newX*newX;
      newXXlag = newX*datum(day+counter+lag,0);
      sumXXlag += newXXlag;
    } // for

    sumXtop = sumX;
    for (;counter<n; counter++) {
      newX = datum(day+counter,0);
      sumX += newX;
      sumXX += newX*newX;
    } // for
    sumXtop = sumX - sumXtop;
  } // if

  mean = sumX/n;
  var = sumXX-sumX*mean;
  covar = sumXXlag - mean*((n+lag)*mean - sumXbottom - sumXtop);
  autocorr = covar/var;
  var /= (n-1);
  covar /= (n-lag-1);

  setResult(day,0,autocorr);
  setResult(day,1,covar);
  setResult(day,2,var);
  setResult(day,3,sqrt(var));
  setResult(day,4,mean);
  setResult(day,5,sumX);
  setResult(day,6,sumXX);
  setResult(day,7,sumXbottom);
  setResult(day,8,sumXtop);
  setResult(day,9,sumXXlag);
}


Boost list run by bdawes at acm.org, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk