Boost logo

Boost :

From: Csaba Szepesvari (cszepes_at_[hidden])
Date: 2003-05-15 13:44:38


Hi,
Once I played with comparing Intel Math kernel with some implementations I
did.
I found out that loop unrolling may double the speed.
Still I could never quite get close to the speech of the Intel Math kernel
(for large matrices), presumably due to insufficient caching.
The above applies to row-major matrices.
For column-major matrices loop unrolling achieved the same speed as the
intel math kernel.

Below is the code if anyone wants to give it a try.
Maybe ublas could make use of some performance optimizations.. (better it
should be connected
to some optimized blas implementations..?)

- Csaba
PS: The entry-point is
void blas_compact_matvec_mul( const T* a, const T* b, T* c, int rows, int
cols )
- here a is a matrix, b is a vector, result is put into c.
----------------------------------------------------------------------------
-----------------------
inline int blas_get_index( int cols, int row_i, int col_i )
{
  return row_i*cols+col_i;
}

inline int blas_get_col_index( int rows, int row_i, int col_i )
{
  return row_i+col_i*rows;
}

template <class T>
void blas_compact_matvec_mul0( const T* a, const T* b, T* c, int rows, int
cols )
{
# undef A
# define A(i,j) (a[blas_get_col_index(rows,i,j)])
  for (int i=0; i<rows; ++i) c[i] = T(0);
  for (int j=0; j<cols; ++j)
  {
    for (int i=0; i<rows; i+=4)
    {
      c[i] += b[j] * A(i,j);
      c[i+1] += b[j] * A(i+1,j);
      c[i+2] += b[j] * A(i+2,j);
      c[i+3] += b[j] * A(i+3,j);
    }
  }
# undef A
}

template <class T>
void blas_compact_matvec_mul1( const T* a, const T* b, T* c, int rows, int
cols )
{
# undef A
# define A(i,j) (a[blas_get_col_index(rows,i,j)])
  int rows4 = rows-1;
  for (int i=0; i<rows; ++i) c[i] = T(0);
  for (int j=0; j<cols; ++j)
  {
    for (int i=0; i<rows4; i+=4)
    {
      c[i] += b[j] * A(i,j);
      c[i+1] += b[j] * A(i+1,j);
      c[i+2] += b[j] * A(i+2,j);
      c[i+3] += b[j] * A(i+3,j);
    }
    c[i] += b[j] * A(i,j);
  }
# undef A
}

template <class T>
void blas_compact_matvec_mul2( const T* a, const T* b, T* c, int rows, int
cols )
{
# undef A
# define A(i,j) (a[blas_get_col_index(rows,i,j)])
  int rows4 = rows-2;
  for (int i=0; i<rows; ++i) c[i] = T(0);
  for (int j=0; j<cols; ++j)
  {
    for (int i=0; i<rows4; i+=4)
    {
      c[i] += b[j] * A(i,j);
      c[i+1] += b[j] * A(i+1,j);
      c[i+2] += b[j] * A(i+2,j);
      c[i+3] += b[j] * A(i+3,j);
    }
    c[i] += b[j] * A(i,j);
    c[i+1] += b[j] * A(i+1,j);
  }
# undef A
}

template <class T>
void blas_compact_matvec_mul3( const T* a, const T* b, T* c, int rows, int
cols )
{
# undef A
# define A(i,j) (a[blas_get_col_index(rows,i,j)])
  int rows4 = rows-3;
  for (int i=0; i<rows; ++i) c[i] = T(0);
  for (int j=0; j<cols; ++j)
  {
    for (int i=0; i<rows4; i+=4)
    {
      c[i] += b[j] * A(i,j);
      c[i+1] += b[j] * A(i+1,j);
      c[i+2] += b[j] * A(i+2,j);
      c[i+3] += b[j] * A(i+3,j);
    }
    c[i] += b[j] * A(i,j);
    c[i+1] += b[j] * A(i+1,j);
    c[i+2] += b[j] * A(i+2,j);
  }
# undef A
}

template <class T>
void blas_compact_matvec_mul( const T* a, const T* b, T* c, int rows, int
cols )
{
  int rows_r4 = (rows & 0x03);
  switch (rows_r4)
  {
    case 0: blas_compact_matvec_mul0( a,b,c, rows, cols); break;
    case 1: blas_compact_matvec_mul1( a,b,c, rows, cols); break;
    case 2: blas_compact_matvec_mul2( a,b,c, rows, cols); ; break;
    default: blas_compact_matvec_mul3( a,b,c, rows, cols);
  }
}

__________________________________
Do you Yahoo!?
The New Yahoo! Search - Faster. Easier. Bingo.
http://search.yahoo.com


Boost list run by bdawes at acm.org, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk