Hi list,

I'm profiling some code and as far as I can tell the main problem I have is with the overhead of matrix indexing. My test program is a simple matrix multiplication that uses ublas matrix indexing to access elements, compared to using C style raw pointers.

The test program shows that C style indexing using pointers is much faster. I'm a bit worried because I've always believed that the optimizer was normally smart enough to 'unwrap' C++ code, look inside and determine that it all boils down to simply indexing a pointer. But it seems that this is not true in my test program...

Any hints? Ideas?
I've compiled with all possible optimizations I could think of, but pointers always beat matrix indexing.


#define NDEBUG
#define BOOST_UBLAS_NDEBUG

#include <boost/numeric/ublas/matrix.hpp>

namespace ublas = boost::numeric::ublas;

void indexing(const ublas::matrix<double>& A,
              const ublas::matrix<double>& B,
              ublas::matrix<double>& C)
{
    C = ublas::zero_matrix<double>( A.size1(), B.size2() );
    const size_t size1 = A.size1();
    const size_t size2 = B.size2();
    const size_t size3 = A.size2();
    for (size_t i = 0; i < size1; ++i)
        for (size_t k = 0; k < size3; ++k)
            for (size_t j = 0; j < size2; ++j)
                C(i,j) += A(i,k) * B(k,j);
}

void pointers(const ublas::matrix<double>& A,
              const ublas::matrix<double>& B,
              ublas::matrix<double>& C)
{
    C = ublas::zero_matrix<double>( A.size1(), B.size2() );
   
    const double* a = &A(0,0);
    const double* b = &B(0,0);
    double* c = &C(0,0);

    const size_t size1 = A.size1();
    const size_t size2 = B.size2();
    const size_t size3 = A.size2();

    for (size_t i = 0; i < size1; ++i)
        for (size_t k = 0; k < size3; ++k)
            for (size_t j = 0; j < size2; ++j)
                c[i*size2+j] += a[i*size3+k] * b[k*size2+j];
}

const size_t N = 100;
const size_t M = 30000;

int main(int argc, char* argv[])
{
    ublas::matrix<double> A(N, N), B(N, M), C;
    for (size_t i = 0; i < A.size1(); ++i)
        for (size_t j = 0; j < A.size2(); ++j)
            A(i,j) = (i + j) / 100.0 + 1 - j;
    for (size_t i = 0; i < B.size1(); ++i)
        for (size_t j = 0; j < B.size2(); ++j)
            B(i,j) = (i - j) / 1000.0 + 1;

    if (argc > 1)
        pointers(A, B, C);
    else
        indexing(A, B, C);

    return 0;
}