Boost :

From: Csaba Szepesvari (cszepes_at_[hidden])
Date: 2003-05-15 13:44:38

Hi,
Once I played with comparing Intel Math kernel with some implementations I
did.
I found out that loop unrolling may double the speed.
Still I could never quite get close to the speech of the Intel Math kernel
(for large matrices), presumably due to insufficient caching.
The above applies to row-major matrices.
For column-major matrices loop unrolling achieved the same speed as the
intel math kernel.

Below is the code if anyone wants to give it a try.
Maybe ublas could make use of some performance optimizations.. (better it
should be connected
to some optimized blas implementations..?)

- Csaba
PS: The entry-point is
void blas_compact_matvec_mul( const T* a, const T* b, T* c, int rows, int
cols )
- here a is a matrix, b is a vector, result is put into c.
----------------------------------------------------------------------------
-----------------------
inline int blas_get_index( int cols, int row_i, int col_i )
{
return row_i*cols+col_i;
}

inline int blas_get_col_index( int rows, int row_i, int col_i )
{
return row_i+col_i*rows;
}

template <class T>
void blas_compact_matvec_mul0( const T* a, const T* b, T* c, int rows, int
cols )
{
# undef A
# define A(i,j) (a[blas_get_col_index(rows,i,j)])
for (int i=0; i<rows; ++i) c[i] = T(0);
for (int j=0; j<cols; ++j)
{
for (int i=0; i<rows; i+=4)
{
c[i] += b[j] * A(i,j);
c[i+1] += b[j] * A(i+1,j);
c[i+2] += b[j] * A(i+2,j);
c[i+3] += b[j] * A(i+3,j);
}
}
# undef A
}

template <class T>
void blas_compact_matvec_mul1( const T* a, const T* b, T* c, int rows, int
cols )
{
# undef A
# define A(i,j) (a[blas_get_col_index(rows,i,j)])
int rows4 = rows-1;
for (int i=0; i<rows; ++i) c[i] = T(0);
for (int j=0; j<cols; ++j)
{
for (int i=0; i<rows4; i+=4)
{
c[i] += b[j] * A(i,j);
c[i+1] += b[j] * A(i+1,j);
c[i+2] += b[j] * A(i+2,j);
c[i+3] += b[j] * A(i+3,j);
}
c[i] += b[j] * A(i,j);
}
# undef A
}

template <class T>
void blas_compact_matvec_mul2( const T* a, const T* b, T* c, int rows, int
cols )
{
# undef A
# define A(i,j) (a[blas_get_col_index(rows,i,j)])
int rows4 = rows-2;
for (int i=0; i<rows; ++i) c[i] = T(0);
for (int j=0; j<cols; ++j)
{
for (int i=0; i<rows4; i+=4)
{
c[i] += b[j] * A(i,j);
c[i+1] += b[j] * A(i+1,j);
c[i+2] += b[j] * A(i+2,j);
c[i+3] += b[j] * A(i+3,j);
}
c[i] += b[j] * A(i,j);
c[i+1] += b[j] * A(i+1,j);
}
# undef A
}

template <class T>
void blas_compact_matvec_mul3( const T* a, const T* b, T* c, int rows, int
cols )
{
# undef A
# define A(i,j) (a[blas_get_col_index(rows,i,j)])
int rows4 = rows-3;
for (int i=0; i<rows; ++i) c[i] = T(0);
for (int j=0; j<cols; ++j)
{
for (int i=0; i<rows4; i+=4)
{
c[i] += b[j] * A(i,j);
c[i+1] += b[j] * A(i+1,j);
c[i+2] += b[j] * A(i+2,j);
c[i+3] += b[j] * A(i+3,j);
}
c[i] += b[j] * A(i,j);
c[i+1] += b[j] * A(i+1,j);
c[i+2] += b[j] * A(i+2,j);
}
# undef A
}

template <class T>
void blas_compact_matvec_mul( const T* a, const T* b, T* c, int rows, int
cols )
{
int rows_r4 = (rows & 0x03);
switch (rows_r4)
{
case 0: blas_compact_matvec_mul0( a,b,c, rows, cols); break;
case 1: blas_compact_matvec_mul1( a,b,c, rows, cols); break;
case 2: blas_compact_matvec_mul2( a,b,c, rows, cols); ; break;
default: blas_compact_matvec_mul3( a,b,c, rows, cols);
}
}

__________________________________
Do you Yahoo!?
The New Yahoo! Search - Faster. Easier. Bingo.
http://search.yahoo.com